em: Fix potential endless loop in attach

2019-09-02 Thread Stefan Fritsch
If the NIC is in some error state during device attach (seen on a i219LM 
when em_read_phy_reg_ex() returns at "MDI Error"), it can happen that we 
loop endlessly because the loop variable is modified again down in the 
call stack:

em_match_gig_phy() -> em_read_phy_reg() -> em_access_phy_reg_hv() -> sets 
hw->phy_addr

Fixing the underlying issue needs some more debugging. But we can use a 
separate variable to avoid the hang. The attach will then error out with 
"Hardware Initialization Failed".

OK?


diff --git a/sys/dev/pci/if_em_hw.c b/sys/dev/pci/if_em_hw.c
index 77adfe5707b..ccbc620739d 100644
--- a/sys/dev/pci/if_em_hw.c
+++ b/sys/dev/pci/if_em_hw.c
@@ -5810,7 +5810,12 @@ em_detect_gig_phy(struct em_hw *hw)
}
 
/* Read the PHY ID Registers to identify which PHY is onboard. */
-   for (hw->phy_addr = 1; (hw->phy_addr < 4); hw->phy_addr++) {
+   for (int i = 1; i < 4; i++) {
+   /*
+* hw->phy_addr may be modified down in the call stack,
+* we can't use it as loop variable.
+*/
+   hw->phy_addr = i;
ret_val = em_match_gig_phy(hw);
if (ret_val == E1000_SUCCESS)
return E1000_SUCCESS;



bnxt: Support MSI-X

2019-09-02 Thread Stefan Fritsch
Tested with a BCM57412

OK?

diff --git a/sys/dev/pci/if_bnxt.c b/sys/dev/pci/if_bnxt.c
--- a/sys/dev/pci/if_bnxt.c
+++ b/sys/dev/pci/if_bnxt.c
@@ -102,6 +102,7 @@
 #define BNXT_FLAG_NPAR  0x0002
 #define BNXT_FLAG_WOL_CAP   0x0004
 #define BNXT_FLAG_SHORT_CMD 0x0008
+#define BNXT_FLAG_MSIX  0x0010
 
 /* NVRam stuff has a five minute timeout */
 #define BNXT_NVM_TIMEO (5 * 60 * 1000)
@@ -507,7 +508,9 @@ bnxt_attach(struct device *parent, struct device *self, 
void *aux)
 * devices advertise msi support, but there's no way to tell a
 * completion queue to use msi mode, only legacy or msi-x.
 */
-   if (/*pci_intr_map_msi(pa, ) != 0 && */ pci_intr_map(pa, ) != 0) {
+   if (pci_intr_map_msix(pa, 0, ) == 0) {
+   sc->sc_flags |= BNXT_FLAG_MSIX;
+   } else if (pci_intr_map(pa, ) != 0) {
printf(": unable to map interrupt\n");
goto free_resp;
}
@@ -2658,7 +2661,9 @@ bnxt_hwrm_ring_alloc(struct bnxt_softc *softc, uint8_t 
type,
req.logical_id = htole16(ring->id);
req.cmpl_ring_id = htole16(cmpl_ring_id);
req.queue_id = htole16(softc->sc_q_info[0].id);
-   req.int_mode = 0;
+   req.int_mode = (softc->sc_flags & BNXT_FLAG_MSIX) ?
+   HWRM_RING_ALLOC_INPUT_INT_MODE_MSIX :
+   HWRM_RING_ALLOC_INPUT_INT_MODE_LEGACY;
BNXT_HWRM_LOCK(softc);
rc = _hwrm_send_message(softc, , sizeof(req));
if (rc)



MSI-X suspend/resume

2019-06-02 Thread Stefan Fritsch
On Fri, May 31, 2019 at 07:35:41AM +1000, Jonathan Matthew wrote:
> I had wondered about the MAU32 define when I was looking at MSI-X
> suspend/resume.

I have also taken a try at MSI-X suspend/resume in the last few weeks.
The diff below is what I have come up with. It worked with nvme on a
laptop and on qemu (but I have not tested it since rebasing on kettenis'
32bit fix).

I have also seen that arm64 got MSI-X in the meantime. That needs to be
adjusted, too.

Cheers,
Stefan


index 8a456f72b09..0b99fe8660d 100644
--- a/sys/arch/amd64/include/pci_machdep.h
+++ b/sys/arch/amd64/include/pci_machdep.h
@@ -98,6 +98,8 @@ void  pci_set_powerstate_md(pci_chipset_tag_t, 
pcitag_t, int, int);
 void   pci_mcfg_init(bus_space_tag_t, bus_addr_t, int, int, int);
 pci_chipset_tag_t pci_lookup_segment(int);
 
+#define __HAVE_PCI_MSIX
+
 /*
  * ALL OF THE FOLLOWING ARE MACHINE-DEPENDENT, AND SHOULD NOT BE USED
  * BY PORTABLE CODE.
diff --git a/sys/arch/amd64/pci/pci_machdep.c b/sys/arch/amd64/pci/pci_machdep.c
index 976ef2d3b6b..4dd95aedd21 100644
--- a/sys/arch/amd64/pci/pci_machdep.c
+++ b/sys/arch/amd64/pci/pci_machdep.c
@@ -446,81 +446,85 @@ msix_hwunmask(struct pic *pic, int pin)
 {
 }
 
+int
+pci_msix_table_map(struct pci_msix_table_info *ti)
+{
+   int bir;
+
+   if (!pci_get_capability(ti->pc, ti->tag, PCI_CAP_MSIX, >cap_offset,
+   >cap_reg)) {
+   return 0;
+   }
+   ti->memt = X86_BUS_SPACE_MEM; /* XXX */
+   ti->table = pci_conf_read(ti->pc, ti->tag,
+   ti->cap_offset + PCI_MSIX_TABLE);
+   bir = (ti->table & PCI_MSIX_TABLE_BIR);
+
+   ti->table_offset = (ti->table & PCI_MSIX_TABLE_OFF);
+   ti->table_size = PCI_MSIX_MC_TBLSZ(ti->cap_reg) + 1;
+
+   bir = PCI_MAPREG_START + bir * 4;
+   if (pci_mem_find(ti->pc, ti->tag, bir, >base, NULL, NULL) ||
+   _bus_space_map(ti->memt, ti->base + ti->table_offset,
+   ti->table_size * 16, 0, >memh)) {
+   panic("%s: cannot map registers", __func__);
+   }
+   return 1;
+}
+
+void
+pci_msix_table_unmap(struct pci_msix_table_info *ti)
+{
+   _bus_space_unmap(ti->memt, ti->memh, ti->table_size * 16, NULL);
+}
+
 void
 msix_addroute(struct pic *pic, struct cpu_info *ci, int pin, int vec, int type)
 {
-   pci_chipset_tag_t pc = NULL; /* XXX */
-   bus_space_tag_t memt = X86_BUS_SPACE_MEM; /* XXX */
-   bus_space_handle_t memh;
-   bus_addr_t base;
-   pcitag_t tag = PCI_MSIX_TAG(pin);
+   struct pci_msix_table_info ti = {
+   .pc = NULL, /* XXX */
+   .tag = PCI_MSIX_TAG(pin),
+   };
int entry = PCI_MSIX_VEC(pin);
-   pcireg_t reg, addr, table;
+   pcireg_t addr;
uint32_t ctrl;
-   int bir, offset;
-   int off, tblsz;
 
-   if (pci_get_capability(pc, tag, PCI_CAP_MSIX, , ) == 0)
+   if (!pci_msix_table_map())
panic("%s: no msix capability", __func__);
 
addr = 0xfee0UL | (ci->ci_apicid << 12);
 
-   table = pci_conf_read(pc, tag, off + PCI_MSIX_TABLE);
-   bir = (table & PCI_MSIX_TABLE_BIR);
-   offset = (table & PCI_MSIX_TABLE_OFF);
-   tblsz = PCI_MSIX_MC_TBLSZ(reg) + 1;
-
-   bir = PCI_MAPREG_START + bir * 4;
-   if (pci_mem_find(pc, tag, bir, , NULL, NULL) ||
-   _bus_space_map(memt, base + offset, tblsz * 16, 0, ))
-   panic("%s: cannot map registers", __func__);
-
-   bus_space_write_4(memt, memh, PCI_MSIX_MA(entry), addr);
-   bus_space_write_4(memt, memh, PCI_MSIX_MAU32(entry), 0);
-   bus_space_write_4(memt, memh, PCI_MSIX_MD(entry), vec);
-   bus_space_barrier(memt, memh, PCI_MSIX_MA(entry), 16,
+   bus_space_write_4(ti.memt, ti.memh, PCI_MSIX_MA(entry), addr);
+   bus_space_write_4(ti.memt, ti.memh, PCI_MSIX_MAU32(entry), 0);
+   bus_space_write_4(ti.memt, ti.memh, PCI_MSIX_MD(entry), vec);
+   bus_space_barrier(ti.memt, ti.memh, PCI_MSIX_MA(entry), 16,
BUS_SPACE_BARRIER_WRITE);
-   ctrl = bus_space_read_4(memt, memh, PCI_MSIX_VC(entry));
-   bus_space_write_4(memt, memh, PCI_MSIX_VC(entry),
+   ctrl = bus_space_read_4(ti.memt, ti.memh, PCI_MSIX_VC(entry));
+   bus_space_write_4(ti.memt, ti.memh, PCI_MSIX_VC(entry),
ctrl & ~PCI_MSIX_VC_MASK);
-
-   _bus_space_unmap(memt, memh, tblsz * 16, NULL);
-
-   pci_conf_write(pc, tag, off, reg | PCI_MSIX_MC_MSIXE);
+   pci_conf_write(ti.pc, ti.tag, ti.cap_offset,
+   ti.cap_reg | PCI_MSIX_MC_MSIXE);
+   pci_msix_table_unmap();
 }
 
 void
 msix_delroute(struct pic *pic, struct cpu_info *ci, int pin, int vec, int type)
 {
-   pci_chipset_tag_t pc = NULL; /* XXX */
-   bus_space_tag_t memt = X86_BUS_SPACE_MEM; /* XXX */
-   bus_space_handle_t memh;
-   bus_addr_t base;
-   pcitag_t tag = PCI_MSIX_TAG(pin);
+   struct pci_msix_table_info ti = {
+   .pc = NULL, /* XXX */
+   .tag = PCI_MSIX_TAG(pin),

[PATCH 2/7] virtio: Prepare for 64 feature bits

2019-01-19 Thread Stefan Fritsch
virtio 1.0 supports an arbitrary number of feature bits. However, so far
no more than 64 are used (compared to 32 in virtio 0.9). Adjust data
types to support 64 feature bits.

Later, we may want to use bitmaps and setbit(), ... to support even more
feature bits.
---
 sys/dev/fdt/virtio_mmio.c |  8 
 sys/dev/pci/virtio_pci.c  |  8 
 sys/dev/pv/if_vio.c   | 40 +++
 sys/dev/pv/vioblk.c   |  2 +-
 sys/dev/pv/vioblkreg.h| 18 +-
 sys/dev/pv/viocon.c   |  6 +++---
 sys/dev/pv/viomb.c|  4 ++--
 sys/dev/pv/vioscsireg.h   |  4 ++--
 sys/dev/pv/virtio.c   |  6 +++---
 sys/dev/pv/virtioreg.h| 10 +-
 sys/dev/pv/virtiovar.h|  6 +++---
 sys/dev/pv/vmmci.c|  8 
 12 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/sys/dev/fdt/virtio_mmio.c b/sys/dev/fdt/virtio_mmio.c
index 3a74e647c19..2d7a131125d 100644
--- a/sys/dev/fdt/virtio_mmio.c
+++ b/sys/dev/fdt/virtio_mmio.c
@@ -91,7 +91,7 @@ void  virtio_mmio_write_device_config_8(struct 
virtio_softc *, int, uint64_t);
 uint16_t   virtio_mmio_read_queue_size(struct virtio_softc *, uint16_t);
 void   virtio_mmio_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_mmio_set_status(struct virtio_softc *, int);
-uint32_t   virtio_mmio_negotiate_features(struct virtio_softc *, uint32_t,
+uint64_t   virtio_mmio_negotiate_features(struct virtio_softc *, uint64_t,
  const struct virtio_feature_name 
*);
 intvirtio_mmio_intr(void *);
 
@@ -300,12 +300,12 @@ virtio_mmio_detach(struct device *self, int flags)
  * Prints available / negotiated features if guest_feature_names != NULL and
  * VIRTIO_DEBUG is 1
  */
-uint32_t
-virtio_mmio_negotiate_features(struct virtio_softc *vsc, uint32_t 
guest_features,
+uint64_t
+virtio_mmio_negotiate_features(struct virtio_softc *vsc, uint64_t 
guest_features,
  const struct virtio_feature_name *guest_feature_names)
 {
struct virtio_mmio_softc *sc = (struct virtio_mmio_softc *)vsc;
-   uint32_t host, neg;
+   uint64_t host, neg;
 
/*
 * indirect descriptors can be switched off by setting bit 1 in the
diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
index d1ab7481df6..97e5607a55e 100644
--- a/sys/dev/pci/virtio_pci.c
+++ b/sys/dev/pci/virtio_pci.c
@@ -66,7 +66,7 @@ void  virtio_pci_write_device_config_8(struct 
virtio_softc *, int, uint64_t);
 uint16_t   virtio_pci_read_queue_size(struct virtio_softc *, uint16_t);
 void   virtio_pci_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_pci_set_status(struct virtio_softc *, int);
-uint32_t   virtio_pci_negotiate_features(struct virtio_softc *, uint32_t,
+uint64_t   virtio_pci_negotiate_features(struct virtio_softc *, uint64_t,
  const struct virtio_feature_name 
*);
 intvirtio_pci_msix_establish(struct virtio_pci_softc *, struct 
pci_attach_args *, int, int (*)(void *), void *);
 intvirtio_pci_setup_msix(struct virtio_pci_softc *, struct 
pci_attach_args *, int);
@@ -317,12 +317,12 @@ virtio_pci_detach(struct device *self, int flags)
  * Prints available / negotiated features if guest_feature_names != NULL and
  * VIRTIO_DEBUG is 1
  */
-uint32_t
-virtio_pci_negotiate_features(struct virtio_softc *vsc, uint32_t 
guest_features,
+uint64_t
+virtio_pci_negotiate_features(struct virtio_softc *vsc, uint64_t 
guest_features,
 const struct virtio_feature_name *guest_feature_names)
 {
struct virtio_pci_softc *sc = (struct virtio_pci_softc *)vsc;
-   uint32_t host, neg;
+   uint64_t host, neg;
 
/*
 * indirect descriptors can be switched off by setting bit 1 in the
diff --git a/sys/dev/pv/if_vio.c b/sys/dev/pv/if_vio.c
index d79935ca512..a4cd80af62d 100644
--- a/sys/dev/pv/if_vio.c
+++ b/sys/dev/pv/if_vio.c
@@ -68,25 +68,25 @@
 #define VIRTIO_NET_CONFIG_STATUS   6 /* 16bit */
 
 /* Feature bits */
-#define VIRTIO_NET_F_CSUM  (1<<0)
-#define VIRTIO_NET_F_GUEST_CSUM(1<<1)
-#define VIRTIO_NET_F_MAC   (1<<5)
-#define VIRTIO_NET_F_GSO   (1<<6)
-#define VIRTIO_NET_F_GUEST_TSO4(1<<7)
-#define VIRTIO_NET_F_GUEST_TSO6(1<<8)
-#define VIRTIO_NET_F_GUEST_ECN (1<<9)
-#define VIRTIO_NET_F_GUEST_UFO (1<<10)
-#define VIRTIO_NET_F_HOST_TSO4 (1<<11)
-#define VIRTIO_NET_F_HOST_TSO6 (1<<12)
-#define VIRTIO_NET_F_HOST_ECN  (1<<13)
-#define VIRTIO_NET_F_HOST_UFO  (1<<14)
-#define VIRTIO_NET_F_MRG_RXBUF (1<<15)
-#define VIRTIO_NET_F_STATUS(1<<16)
-#define VIRTIO_NET_F_CTRL_VQ   (1<<17)
-#define VIRTIO_NET_F_CTRL_RX   (1<<18)
-#define VIRTIO_NET_F_CTRL_VLAN (1<<19)

[PATCH 6/7] Rework virtio_negotiate_features()

2019-01-19 Thread Stefan Fritsch
Add a sc_driver_features field that is automatically used by
virtio_negotiate_features() and during reinit.

Make virtio_negotiate_features() return an error code.  Virtio 1.0 has a
special status bit for feature negotiation that means that negotiation
can fail. Make virtio_negotiate_features() return an error code instead
of the features.

Make virtio_reinit_start() automatically call
virtio_negotiate_features().

Add a convenience function virtio_has_feature() to make checking bits
easier.

Add an error check in viomb for virtio_negotiate_features because it has
some feature bits that may cause negotiation to fail. More error
checking in the child drivers is still missing.
---
 sys/dev/fdt/virtio_mmio.c | 38 ++
 sys/dev/pci/virtio_pci.c  | 39 +++
 sys/dev/pv/if_vio.c   | 38 +-
 sys/dev/pv/vioblk.c   | 25 +
 sys/dev/pv/viocon.c   |  6 +++---
 sys/dev/pv/viomb.c|  9 +
 sys/dev/pv/viornd.c   |  2 +-
 sys/dev/pv/vioscsi.c  |  2 +-
 sys/dev/pv/virtio.c   | 13 +++--
 sys/dev/pv/virtiovar.h| 15 ---
 sys/dev/pv/vmmci.c| 13 ++---
 11 files changed, 110 insertions(+), 90 deletions(-)

diff --git a/sys/dev/fdt/virtio_mmio.c b/sys/dev/fdt/virtio_mmio.c
index 2d7a131125d..6e09fb0e76f 100644
--- a/sys/dev/fdt/virtio_mmio.c
+++ b/sys/dev/fdt/virtio_mmio.c
@@ -91,8 +91,8 @@ void  virtio_mmio_write_device_config_8(struct 
virtio_softc *, int, uint64_t);
 uint16_t   virtio_mmio_read_queue_size(struct virtio_softc *, uint16_t);
 void   virtio_mmio_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_mmio_set_status(struct virtio_softc *, int);
-uint64_t   virtio_mmio_negotiate_features(struct virtio_softc *, uint64_t,
- const struct virtio_feature_name 
*);
+intvirtio_mmio_negotiate_features(struct virtio_softc *,
+const struct virtio_feature_name *);
 intvirtio_mmio_intr(void *);
 
 struct virtio_mmio_softc {
@@ -300,28 +300,42 @@ virtio_mmio_detach(struct device *self, int flags)
  * Prints available / negotiated features if guest_feature_names != NULL and
  * VIRTIO_DEBUG is 1
  */
-uint64_t
-virtio_mmio_negotiate_features(struct virtio_softc *vsc, uint64_t 
guest_features,
- const struct virtio_feature_name *guest_feature_names)
+int
+virtio_mmio_negotiate_features(struct virtio_softc *vsc,
+const struct virtio_feature_name *guest_feature_names)
 {
struct virtio_mmio_softc *sc = (struct virtio_mmio_softc *)vsc;
uint64_t host, neg;
 
+   vsc->sc_active_features = 0;
+
/*
-* indirect descriptors can be switched off by setting bit 1 in the
-* driver flags, see config(8)
+* We enable indirect descriptors by default. They can be switched
+* off by setting bit 1 in the driver flags, see config(8).
 */
if (!(vsc->sc_dev.dv_cfdata->cf_flags & VIRTIO_CF_NO_INDIRECT) &&
!(vsc->sc_child->dv_cfdata->cf_flags & VIRTIO_CF_NO_INDIRECT)) {
-   guest_features |= VIRTIO_F_RING_INDIRECT_DESC;
-   } else {
+   vsc->sc_driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
+   } else if (guest_feature_names != NULL) {
printf("RingIndirectDesc disabled by UKC\n");
}
+   /*
+* The driver must add VIRTIO_F_RING_EVENT_IDX if it supports it.
+* If it did, check if it is disabled by bit 2 in the driver flags.
+*/
+   if ((vsc->sc_driver_features & VIRTIO_F_RING_EVENT_IDX) &&
+   ((vsc->sc_dev.dv_cfdata->cf_flags & VIRTIO_CF_NO_EVENT_IDX) ||
+   (vsc->sc_child->dv_cfdata->cf_flags & VIRTIO_CF_NO_EVENT_IDX))) {
+   if (guest_feature_names != NULL)
+   printf(" RingEventIdx disabled by UKC");
+   vsc->sc_driver_features &= ~(VIRTIO_F_RING_EVENT_IDX);
+   }
+
bus_space_write_4(sc->sc_iot, sc->sc_ioh,
VIRTIO_MMIO_HOST_FEATURES_SEL, 0);
host = bus_space_read_4(sc->sc_iot, sc->sc_ioh,
VIRTIO_MMIO_HOST_FEATURES);
-   neg = host & guest_features;
+   neg = host & vsc->sc_driver_features;
 #if VIRTIO_DEBUG
if (guest_feature_names)
virtio_log_features(host, neg, guest_feature_names);
@@ -330,13 +344,13 @@ virtio_mmio_negotiate_features(struct virtio_softc *vsc, 
uint64_t guest_features
VIRTIO_MMIO_GUEST_FEATURES_SEL, 0);
bus_space_write_4(sc->sc_iot, sc->sc_ioh,
  VIRTIO_MMIO_GUEST_FEATURES, neg);
-   vsc->sc_features = neg;
+   vsc->sc_active_features = neg;
if (neg & VIRTIO_F_RING_INDIRECT_DESC)
vsc->sc_indirect = 1;
else
vsc->sc_indirect = 0;
 
-   return neg;
+ 

[PATCH 7/7] Support virtio 1.0 for virtio_pci

2019-01-19 Thread Stefan Fritsch
virtio 1.0 for virtio_mmio it not yet implemented, but 0.9 devices
continue to work.
---
 share/man/man4/virtio.4 |  11 +-
 sys/dev/pci/virtio_pci.c| 518 +++-
 sys/dev/pci/virtio_pcireg.h |  57 
 sys/dev/pv/if_vio.c |   9 +-
 sys/dev/pv/virtiovar.h  |   5 +
 5 files changed, 531 insertions(+), 69 deletions(-)

diff --git a/share/man/man4/virtio.4 b/share/man/man4/virtio.4
index 5e60e2e0c32..dd9d4551715 100644
--- a/share/man/man4/virtio.4
+++ b/share/man/man4/virtio.4
@@ -22,7 +22,7 @@
 .Nd VirtIO support driver
 .Sh SYNOPSIS
 .Cd "virtio* at fdt?"
-.Cd "virtio* at pci?"
+.Cd "virtio* at pci? flags 0x00"
 .Sh DESCRIPTION
 The
 .Nm
@@ -56,7 +56,14 @@ control interface
 The
 .Nm
 driver conforms to the virtio 0.9.5 specification.
-The virtio 1.0 standard is not supported, yet.
+The virtio 1.0 standard is only supported for pci devices.
+.Pp
+By default 0.9 is preferred over 1.0.
+This can be changed by setting the bit 0x4 in the flags.
+.Pp
+Setting the bit 0x8 in the flags disables the 1.0 support completely.
+.Sh BUGS
+.Nm Big-endian architectures are not yet supported.
 .Sh SEE ALSO
 .Xr intro 4
 .Sh HISTORY
diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
index f370f513e85..591da61290a 100644
--- a/sys/dev/pci/virtio_pci.c
+++ b/sys/dev/pci/virtio_pci.c
@@ -35,11 +35,16 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
+#define DNPRINTF(n,x...)   \
+do { if (VIRTIO_DEBUG >= n) printf(x); } while(0)
+
+
 /*
  * XXX: Before being used on big endian arches, the access to config registers
  * XXX: needs to be reviewed/fixed. The non-device specific registers are
@@ -52,6 +57,8 @@ struct virtio_pci_softc;
 
 intvirtio_pci_match(struct device *, void *, void *);
 void   virtio_pci_attach(struct device *, struct device *, void *);
+intvirtio_pci_attach_09(struct virtio_pci_softc *sc, struct 
pci_attach_args *pa);
+intvirtio_pci_attach_10(struct virtio_pci_softc *sc, struct 
pci_attach_args *pa);
 intvirtio_pci_detach(struct device *, int);
 
 void   virtio_pci_kick(struct virtio_softc *, uint16_t);
@@ -67,8 +74,8 @@ void  virtio_pci_write_device_config_8(struct 
virtio_softc *, int, uint64_t);
 uint16_t   virtio_pci_read_queue_size(struct virtio_softc *, uint16_t);
 void   virtio_pci_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_pci_set_status(struct virtio_softc *, int);
-intvirtio_pci_negotiate_features(struct virtio_softc *,
-const struct virtio_feature_name *);
+intvirtio_pci_negotiate_features(struct virtio_softc *, const 
struct virtio_feature_name *);
+intvirtio_pci_negotiate_features_10(struct virtio_softc *, const 
struct virtio_feature_name *);
 void   virtio_pci_set_msix_queue_vector(struct virtio_pci_softc *, 
uint32_t, uint16_t);
 void   virtio_pci_set_msix_config_vector(struct virtio_pci_softc *, 
uint16_t);
 intvirtio_pci_msix_establish(struct virtio_pci_softc *, struct 
pci_attach_args *, int, int (*)(void *), void *);
@@ -80,6 +87,10 @@ int  virtio_pci_legacy_intr_mpsafe(void *);
 intvirtio_pci_config_intr(void *);
 intvirtio_pci_queue_intr(void *);
 intvirtio_pci_shared_queue_intr(void *);
+intvirtio_pci_find_cap(struct virtio_pci_softc *sc, int cfg_type, 
void *buf, int buflen);
+#if VIRTIO_DEBUG
+void virtio_pci_dump_caps(struct virtio_pci_softc *sc);
+#endif
 
 enum irq_type {
IRQ_NO_MSIX,
@@ -96,9 +107,14 @@ struct virtio_pci_softc {
bus_space_handle_t  sc_ioh;
bus_size_t  sc_iosize;
 
+   bus_space_tag_t sc_bars_iot[4];
+   bus_space_handle_t  sc_bars_ioh[4];
+   bus_size_t  sc_bars_iosize[4];
+
bus_space_tag_t sc_notify_iot;
bus_space_handle_t  sc_notify_ioh;
bus_size_t  sc_notify_iosize;
+   unsigned intsc_notify_off_multiplier;
 
bus_space_tag_t sc_devcfg_iot;
bus_space_handle_t  sc_devcfg_ioh;
@@ -145,14 +161,69 @@ struct virtio_ops virtio_pci_ops = {
virtio_pci_poll_intr,
 };
 
+static inline
+uint64_t _cread(struct virtio_pci_softc *sc, unsigned off, unsigned size)
+{
+   uint64_t val;
+   switch (size) {
+   case 1:
+   val = bus_space_read_1(sc->sc_iot, sc->sc_ioh, off);
+   break;
+   case 2:
+   val = bus_space_read_2(sc->sc_iot, sc->sc_ioh, off);
+   break;
+   case 4:
+   val = bus_space_read_4(sc->sc_iot, sc->sc_ioh, off);
+   break;
+   case 8:
+   val = bus_space_read_8(sc->sc_iot, sc->sc_ioh, off);
+   break;
+   }
+   return val;
+}
+
+#define CREAD(sc, memb)  _cread(sc, offsetof(struct 

[PATCH 4/7] virtio_pci: Split bus space handles

2019-01-19 Thread Stefan Fritsch
In virtio_pci 1.0, different parts of the register set may be located in
different BARs. Use subregions to make the access independent of the
virtio version.
---
 sys/dev/pci/virtio_pci.c | 114 +++
 1 file changed, 80 insertions(+), 34 deletions(-)

diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
index 97e5607a55e..d69db1968d0 100644
--- a/sys/dev/pci/virtio_pci.c
+++ b/sys/dev/pci/virtio_pci.c
@@ -55,6 +55,7 @@ void  virtio_pci_attach(struct device *, struct 
device *, void *);
 intvirtio_pci_detach(struct device *, int);
 
 void   virtio_pci_kick(struct virtio_softc *, uint16_t);
+intvirtio_pci_adjust_config_region(struct virtio_pci_softc *);
 uint8_tvirtio_pci_read_device_config_1(struct virtio_softc *, 
int);
 uint16_t   virtio_pci_read_device_config_2(struct virtio_softc *, int);
 uint32_t   virtio_pci_read_device_config_4(struct virtio_softc *, int);
@@ -87,14 +88,33 @@ enum irq_type {
 struct virtio_pci_softc {
struct virtio_softc sc_sc;
pci_chipset_tag_t   sc_pc;
+   pcitag_tsc_ptag;
 
bus_space_tag_t sc_iot;
bus_space_handle_t  sc_ioh;
bus_size_t  sc_iosize;
 
+   bus_space_tag_t sc_notify_iot;
+   bus_space_handle_t  sc_notify_ioh;
+   bus_size_t  sc_notify_iosize;
+
+   bus_space_tag_t sc_devcfg_iot;
+   bus_space_handle_t  sc_devcfg_ioh;
+   bus_size_t  sc_devcfg_iosize;
+   /*
+* With 0.9, the offset of the devcfg region in the io bar changes
+* depending on MSI-X being enabled or not.
+* With 1.0, this field is still used to remember if MSI-X is enabled
+* or not.
+*/
+   unsigned intsc_devcfg_offset;
+
+   bus_space_tag_t sc_isr_iot;
+   bus_space_handle_t  sc_isr_ioh;
+   bus_size_t  sc_isr_iosize;
+
void*sc_ih[MAX_MSIX_VECS];
 
-   int sc_config_offset;
enum irq_type   sc_irq_type;
 };
 
@@ -213,9 +233,8 @@ virtio_pci_attach(struct device *parent, struct device 
*self, void *aux)
 
vsc->sc_ops = _pci_ops;
sc->sc_pc = pc;
+   sc->sc_ptag = pa->pa_tag;
vsc->sc_dmat = pa->pa_dmat;
-   sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
-   sc->sc_irq_type = IRQ_NO_MSIX;
 
/*
 * For virtio, ignore normal MSI black/white-listing depending on the
@@ -229,11 +248,33 @@ virtio_pci_attach(struct device *parent, struct device 
*self, void *aux)
return;
}
 
+   if (bus_space_subregion(sc->sc_iot, sc->sc_ioh,
+   VIRTIO_CONFIG_QUEUE_NOTIFY, 2, >sc_notify_ioh) != 0) {
+   printf("%s: can't map notify i/o space\n",
+   vsc->sc_dev.dv_xname);
+   return;
+   }
+   sc->sc_notify_iosize = 2;
+   sc->sc_notify_iot = sc->sc_iot;
+
+   if (bus_space_subregion(sc->sc_iot, sc->sc_ioh,
+   VIRTIO_CONFIG_ISR_STATUS, 1, >sc_isr_ioh) != 0) {
+   printf("%s: can't map isr i/o space\n",
+   vsc->sc_dev.dv_xname);
+   return;
+   }
+   sc->sc_isr_iosize = 1;
+   sc->sc_isr_iot = sc->sc_iot;
+
+   sc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
+   sc->sc_irq_type = IRQ_NO_MSIX;
+   if (virtio_pci_adjust_config_region(sc) != 0)
+   return;
+
virtio_device_reset(vsc);
virtio_pci_set_status(vsc, VIRTIO_CONFIG_DEVICE_STATUS_ACK);
virtio_pci_set_status(vsc, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER);
 
-   /* XXX: use softc as aux... */
vsc->sc_childdevid = id;
vsc->sc_child = NULL;
config_found(self, sc, NULL);
@@ -312,6 +353,20 @@ virtio_pci_detach(struct device *self, int flags)
return 0;
 }
 
+int
+virtio_pci_adjust_config_region(struct virtio_pci_softc *sc)
+{
+   sc->sc_devcfg_iosize = sc->sc_iosize - sc->sc_devcfg_offset;
+   sc->sc_devcfg_iot = sc->sc_iot;
+   if (bus_space_subregion(sc->sc_iot, sc->sc_ioh, sc->sc_devcfg_offset,
+   sc->sc_devcfg_iosize, >sc_devcfg_ioh) != 0) {
+   printf("%s: can't map config i/o space\n",
+   sc->sc_sc.sc_dev.dv_xname);
+   return 1;
+   }
+   return 0;
+}
+
 /*
  * Feature negotiation.
  * Prints available / negotiated features if guest_feature_names != NULL and
@@ -359,24 +414,21 @@ uint8_t
 virtio_pci_read_device_config_1(struct virtio_softc *vsc, int index)
 {
struct virtio_pci_softc *sc = (struct virtio_pci_softc *)vsc;
-   return bus_space_read_1(sc->sc_iot, sc->sc_ioh,
-   sc->sc_config_offset + index);
+   return bus_space_read_1(sc->sc_devcfg_iot, sc->sc_devcfg_ioh, index);
 }
 
 uint16_t
 virtio_pci_read_device_config_2(struct virtio_softc *vsc, int 

[PATCH 5/7] virtio_pci: Move msix vector config into functions

2019-01-19 Thread Stefan Fritsch
---
 sys/dev/pci/virtio_pci.c | 38 --
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
index d69db1968d0..7be93684a68 100644
--- a/sys/dev/pci/virtio_pci.c
+++ b/sys/dev/pci/virtio_pci.c
@@ -69,6 +69,8 @@ void  virtio_pci_setup_queue(struct virtio_softc *, 
struct virtqueue *, uint64_t
 void   virtio_pci_set_status(struct virtio_softc *, int);
 uint64_t   virtio_pci_negotiate_features(struct virtio_softc *, uint64_t,
  const struct virtio_feature_name 
*);
+void   virtio_pci_set_msix_queue_vector(struct virtio_pci_softc *, 
uint32_t, uint16_t);
+void   virtio_pci_set_msix_config_vector(struct virtio_pci_softc *, 
uint16_t);
 intvirtio_pci_msix_establish(struct virtio_pci_softc *, struct 
pci_attach_args *, int, int (*)(void *), void *);
 intvirtio_pci_setup_msix(struct virtio_pci_softc *, struct 
pci_attach_args *, int);
 void   virtio_pci_free_irqs(struct virtio_pci_softc *);
@@ -503,6 +505,22 @@ virtio_pci_msix_establish(struct virtio_pci_softc *sc,
return 0;
 }
 
+void
+virtio_pci_set_msix_queue_vector(struct virtio_pci_softc *sc, uint32_t idx, 
uint16_t vector)
+{
+   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
+   VIRTIO_CONFIG_QUEUE_SELECT, idx);
+   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
+   VIRTIO_MSI_QUEUE_VECTOR, vector);
+}
+
+void
+virtio_pci_set_msix_config_vector(struct virtio_pci_softc *sc, uint16_t vector)
+{
+   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
+   VIRTIO_MSI_CONFIG_VECTOR, vector);
+}
+
 void
 virtio_pci_free_irqs(struct virtio_pci_softc *sc)
 {
@@ -511,10 +529,8 @@ virtio_pci_free_irqs(struct virtio_pci_softc *sc)
 
if (sc->sc_devcfg_offset == VIRTIO_CONFIG_DEVICE_CONFIG_MSI) {
for (i = 0; i < vsc->sc_nvqs; i++) {
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_CONFIG_QUEUE_SELECT, i);
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_MSI_QUEUE_VECTOR, VIRTIO_MSI_NO_VECTOR);
+   virtio_pci_set_msix_queue_vector(sc, i,
+   VIRTIO_MSI_NO_VECTOR);
}
}
 
@@ -540,6 +556,7 @@ virtio_pci_setup_msix(struct virtio_pci_softc *sc, struct 
pci_attach_args *pa,
return 1;
sc->sc_devcfg_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSI;
virtio_pci_adjust_config_region(sc);
+   virtio_pci_set_msix_config_vector(sc, 0);
 
if (shared) {
if (virtio_pci_msix_establish(sc, pa, 1,
@@ -547,22 +564,15 @@ virtio_pci_setup_msix(struct virtio_pci_softc *sc, struct 
pci_attach_args *pa,
goto fail;
}
 
-   for (i = 0; i < vsc->sc_nvqs; i++) {
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_CONFIG_QUEUE_SELECT, i);
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_MSI_QUEUE_VECTOR, 1);
-   }
+   for (i = 0; i < vsc->sc_nvqs; i++)
+   virtio_pci_set_msix_queue_vector(sc, i, 1);
} else {
for (i = 0; i <= vsc->sc_nvqs; i++) {
if (virtio_pci_msix_establish(sc, pa, i + 1,
virtio_pci_queue_intr, >sc_vqs[i])) {
goto fail;
}
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_CONFIG_QUEUE_SELECT, i);
-   bus_space_write_2(sc->sc_iot, sc->sc_ioh,
-   VIRTIO_MSI_QUEUE_VECTOR, i + 1);
+   virtio_pci_set_msix_queue_vector(sc, i, i + 1);
}
}
 
-- 
2.19.0



[PATCH 1/7] virtio: adjust virtio_setup_queue prototype for 1.0

2019-01-19 Thread Stefan Fritsch
Make it take an address instead of a PFN.
Pass the virtqueue pointer. In virtio 1.0, more information has to be
configured in the device. Also call virtio_setup_queue() after the
information has been filled in.
---
 sys/dev/fdt/virtio_mmio.c | 11 +++
 sys/dev/pci/virtio_pci.c  | 11 ++-
 sys/dev/pv/virtio.c   | 11 ---
 sys/dev/pv/virtiovar.h|  2 +-
 4 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/sys/dev/fdt/virtio_mmio.c b/sys/dev/fdt/virtio_mmio.c
index 11867a43173..3a74e647c19 100644
--- a/sys/dev/fdt/virtio_mmio.c
+++ b/sys/dev/fdt/virtio_mmio.c
@@ -89,7 +89,7 @@ void  virtio_mmio_write_device_config_2(struct 
virtio_softc *, int, uint16_t);
 void   virtio_mmio_write_device_config_4(struct virtio_softc *, int, 
uint32_t);
 void   virtio_mmio_write_device_config_8(struct virtio_softc *, int, 
uint64_t);
 uint16_t   virtio_mmio_read_queue_size(struct virtio_softc *, uint16_t);
-void   virtio_mmio_setup_queue(struct virtio_softc *, uint16_t, 
uint32_t);
+void   virtio_mmio_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_mmio_set_status(struct virtio_softc *, int);
 uint32_t   virtio_mmio_negotiate_features(struct virtio_softc *, uint32_t,
  const struct virtio_feature_name 
*);
@@ -151,15 +151,18 @@ virtio_mmio_read_queue_size(struct virtio_softc *vsc, 
uint16_t idx)
 }
 
 void
-virtio_mmio_setup_queue(struct virtio_softc *vsc, uint16_t idx, uint32_t addr)
+virtio_mmio_setup_queue(struct virtio_softc *vsc, struct virtqueue *vq,
+uint64_t addr)
 {
struct virtio_mmio_softc *sc = (struct virtio_mmio_softc *)vsc;
-   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_SEL, idx);
+   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_SEL,
+   vq->vq_index);
bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_NUM,
bus_space_read_4(sc->sc_iot, sc->sc_ioh, 
VIRTIO_MMIO_QUEUE_NUM_MAX));
bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_ALIGN,
PAGE_SIZE);
-   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_PFN, addr);
+   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_PFN,
+   addr / VIRTIO_PAGE_SIZE);
 }
 
 void
diff --git a/sys/dev/pci/virtio_pci.c b/sys/dev/pci/virtio_pci.c
index cce99e3d720..d1ab7481df6 100644
--- a/sys/dev/pci/virtio_pci.c
+++ b/sys/dev/pci/virtio_pci.c
@@ -64,7 +64,7 @@ void  virtio_pci_write_device_config_2(struct 
virtio_softc *, int, uint16_t);
 void   virtio_pci_write_device_config_4(struct virtio_softc *, int, 
uint32_t);
 void   virtio_pci_write_device_config_8(struct virtio_softc *, int, 
uint64_t);
 uint16_t   virtio_pci_read_queue_size(struct virtio_softc *, uint16_t);
-void   virtio_pci_setup_queue(struct virtio_softc *, uint16_t, 
uint32_t);
+void   virtio_pci_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_pci_set_status(struct virtio_softc *, int);
 uint32_t   virtio_pci_negotiate_features(struct virtio_softc *, uint32_t,
  const struct virtio_feature_name 
*);
@@ -134,13 +134,14 @@ virtio_pci_read_queue_size(struct virtio_softc *vsc, 
uint16_t idx)
 }
 
 void
-virtio_pci_setup_queue(struct virtio_softc *vsc, uint16_t idx, uint32_t addr)
+virtio_pci_setup_queue(struct virtio_softc *vsc, struct virtqueue *vq,
+uint64_t addr)
 {
struct virtio_pci_softc *sc = (struct virtio_pci_softc *)vsc;
bus_space_write_2(sc->sc_iot, sc->sc_ioh, VIRTIO_CONFIG_QUEUE_SELECT,
-   idx);
+   vq->vq_index);
bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_CONFIG_QUEUE_ADDRESS,
-   addr);
+   addr / VIRTIO_PAGE_SIZE);
 
/*
 * This path is only executed if this function is called after
@@ -150,7 +151,7 @@ virtio_pci_setup_queue(struct virtio_softc *vsc, uint16_t 
idx, uint32_t addr)
if (sc->sc_irq_type != IRQ_NO_MSIX) {
int vec = 1;
if (sc->sc_irq_type == IRQ_MSIX_PER_VQ)
-  vec += idx;
+  vec += vq->vq_index;
bus_space_write_2(sc->sc_iot, sc->sc_ioh,
VIRTIO_MSI_QUEUE_VECTOR, vec);
}
diff --git a/sys/dev/pv/virtio.c b/sys/dev/pv/virtio.c
index e6cd1ace5aa..be81878642f 100644
--- a/sys/dev/pv/virtio.c
+++ b/sys/dev/pv/virtio.c
@@ -154,8 +154,7 @@ virtio_reinit_start(struct virtio_softc *sc)
sc->sc_dev.dv_xname, vq->vq_index);
}
virtio_init_vq(sc, vq, 1);
-   virtio_setup_queue(sc, vq->vq_index,
-   vq->vq_dmamap->dm_segs[0].ds_addr / VIRTIO_PAGE_SIZE);
+   virtio_setup_queue(sc, vq, vq->vq_dmamap->dm_segs[0].ds_addr);
}
 }
 
@@ -345,9 +344,6 @@ 

[PATCH 0/7] Virtio 1.0 in the kernel

2019-01-19 Thread Stefan Fritsch
Hi,

here comes the split up diff for virtio 1.0 support.  Compared to the previous
diff, I have omitted some changes in how the feature bits and negotiation
works. This also means that the feature bit defines in the headers stay the
same and vmd is not affected. I have tested that virtio_mmio on arm64 still
works.

Cheers,
Stefan



[PATCH 3/7] virtio: Add a few feature bit defines and names

2019-01-19 Thread Stefan Fritsch
---
 sys/dev/pv/if_vio.c| 84 +++---
 sys/dev/pv/vioblk.c|  3 ++
 sys/dev/pv/vioblkreg.h | 21 ++-
 sys/dev/pv/virtio.c|  1 +
 4 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/sys/dev/pv/if_vio.c b/sys/dev/pv/if_vio.c
index a4cd80af62d..bfc7cfd1ddc 100644
--- a/sys/dev/pv/if_vio.c
+++ b/sys/dev/pv/if_vio.c
@@ -68,25 +68,29 @@
 #define VIRTIO_NET_CONFIG_STATUS   6 /* 16bit */
 
 /* Feature bits */
-#define VIRTIO_NET_F_CSUM  (1ULL<<0)
-#define VIRTIO_NET_F_GUEST_CSUM(1ULL<<1)
-#define VIRTIO_NET_F_MAC   (1ULL<<5)
-#define VIRTIO_NET_F_GSO   (1ULL<<6)
-#define VIRTIO_NET_F_GUEST_TSO4(1ULL<<7)
-#define VIRTIO_NET_F_GUEST_TSO6(1ULL<<8)
-#define VIRTIO_NET_F_GUEST_ECN (1ULL<<9)
-#define VIRTIO_NET_F_GUEST_UFO (1ULL<<10)
-#define VIRTIO_NET_F_HOST_TSO4 (1ULL<<11)
-#define VIRTIO_NET_F_HOST_TSO6 (1ULL<<12)
-#define VIRTIO_NET_F_HOST_ECN  (1ULL<<13)
-#define VIRTIO_NET_F_HOST_UFO  (1ULL<<14)
-#define VIRTIO_NET_F_MRG_RXBUF (1ULL<<15)
-#define VIRTIO_NET_F_STATUS(1ULL<<16)
-#define VIRTIO_NET_F_CTRL_VQ   (1ULL<<17)
-#define VIRTIO_NET_F_CTRL_RX   (1ULL<<18)
-#define VIRTIO_NET_F_CTRL_VLAN (1ULL<<19)
-#define VIRTIO_NET_F_CTRL_RX_EXTRA (1ULL<<20)
-#define VIRTIO_NET_F_GUEST_ANNOUNCE(1ULL<<21)
+#define VIRTIO_NET_F_CSUM  (1ULL<<0)
+#define VIRTIO_NET_F_GUEST_CSUM(1ULL<<1)
+#define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS(1ULL<<2)
+#define VIRTIO_NET_F_MTU(1ULL<<3)
+#define VIRTIO_NET_F_MAC   (1ULL<<5)
+#define VIRTIO_NET_F_GSO   (1ULL<<6)
+#define VIRTIO_NET_F_GUEST_TSO4(1ULL<<7)
+#define VIRTIO_NET_F_GUEST_TSO6(1ULL<<8)
+#define VIRTIO_NET_F_GUEST_ECN (1ULL<<9)
+#define VIRTIO_NET_F_GUEST_UFO (1ULL<<10)
+#define VIRTIO_NET_F_HOST_TSO4 (1ULL<<11)
+#define VIRTIO_NET_F_HOST_TSO6 (1ULL<<12)
+#define VIRTIO_NET_F_HOST_ECN  (1ULL<<13)
+#define VIRTIO_NET_F_HOST_UFO  (1ULL<<14)
+#define VIRTIO_NET_F_MRG_RXBUF (1ULL<<15)
+#define VIRTIO_NET_F_STATUS(1ULL<<16)
+#define VIRTIO_NET_F_CTRL_VQ   (1ULL<<17)
+#define VIRTIO_NET_F_CTRL_RX   (1ULL<<18)
+#define VIRTIO_NET_F_CTRL_VLAN (1ULL<<19)
+#define VIRTIO_NET_F_CTRL_RX_EXTRA (1ULL<<20)
+#define VIRTIO_NET_F_GUEST_ANNOUNCE(1ULL<<21)
+#define VIRTIO_NET_F_MQ(1ULL<<22)
+#define VIRTIO_NET_F_CTRL_MAC_ADDR (1ULL<<23)
 
 /*
  * Config(8) flags. The lowest byte is reserved for generic virtio stuff.
@@ -97,25 +101,29 @@
 
 static const struct virtio_feature_name virtio_net_feature_names[] = {
 #if VIRTIO_DEBUG
-   { VIRTIO_NET_F_CSUM,"CSum" },
-   { VIRTIO_NET_F_GUEST_CSUM,  "GuestCSum" },
-   { VIRTIO_NET_F_MAC, "MAC" },
-   { VIRTIO_NET_F_GSO, "GSO" },
-   { VIRTIO_NET_F_GUEST_TSO4,  "GuestTSO4" },
-   { VIRTIO_NET_F_GUEST_TSO6,  "GuestTSO6" },
-   { VIRTIO_NET_F_GUEST_ECN,   "GuestECN" },
-   { VIRTIO_NET_F_GUEST_UFO,   "GuestUFO" },
-   { VIRTIO_NET_F_HOST_TSO4,   "HostTSO4" },
-   { VIRTIO_NET_F_HOST_TSO6,   "HostTSO6" },
-   { VIRTIO_NET_F_HOST_ECN,"HostECN" },
-   { VIRTIO_NET_F_HOST_UFO,"HostUFO" },
-   { VIRTIO_NET_F_MRG_RXBUF,   "MrgRXBuf" },
-   { VIRTIO_NET_F_STATUS,  "Status" },
-   { VIRTIO_NET_F_CTRL_VQ, "CtrlVQ" },
-   { VIRTIO_NET_F_CTRL_RX, "CtrlRX" },
-   { VIRTIO_NET_F_CTRL_VLAN,   "CtrlVLAN" },
-   { VIRTIO_NET_F_CTRL_RX_EXTRA,   "CtrlRXExtra" },
-   { VIRTIO_NET_F_GUEST_ANNOUNCE,  "GuestAnnounce" },
+   { VIRTIO_NET_F_CSUM,"CSum" },
+   { VIRTIO_NET_F_GUEST_CSUM,  "GuestCSum" },
+   { VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, "CtrlGuestOffl" },
+   { VIRTIO_NET_F_MTU, "MTU", },
+   { VIRTIO_NET_F_MAC, "MAC" },
+   { VIRTIO_NET_F_GSO, "GSO" },
+   { VIRTIO_NET_F_GUEST_TSO4,  "GuestTSO4" },
+   { VIRTIO_NET_F_GUEST_TSO6,  "GuestTSO6" },
+   { VIRTIO_NET_F_GUEST_ECN,   "GuestECN" },
+   { VIRTIO_NET_F_GUEST_UFO,   "GuestUFO" },
+   { VIRTIO_NET_F_HOST_TSO4,   "HostTSO4" },
+   { VIRTIO_NET_F_HOST_TSO6,   "HostTSO6" },
+   { VIRTIO_NET_F_HOST_ECN,"HostECN" },
+   { VIRTIO_NET_F_HOST_UFO,"HostUFO" },
+   { VIRTIO_NET_F_MRG_RXBUF,   "MrgRXBuf" },
+   { 

Re: Virtio 1.0 for the kernel

2019-01-19 Thread Stefan Fritsch
On Saturday, 12 January 2019 01:49:09 CET Jonathan Gray wrote:
> On Fri, Jan 11, 2019 at 03:21:11PM -0800, William Ahern wrote:
> > On Fri, Jan 11, 2019 at 10:43:25AM +0100, Stefan Fritsch wrote:
> > 
> > 
> > > /* only used for sizeof, not actually allocated */
> > > extern struct virtio_pci_common_cfg ccfg;
> > 
> > 
> > 
> > > #define CREAD(sc, memb)  _cread(sc, \
> > > 
> > > offsetof(struct virtio_pci_common_cfg, memb), sizeof(ccfg.memb))
> > > 
> > > The compiler should optimize this to the same code as the complicated
> > > macro above. You think this variant is acceptable?
> > 
> > Maybe I'm missing something, but these are the more idiomatic constructs
> > 
> >   sizeof ((struct virtio_pci_common_cfg *)0)->memb
> >   sizeof ((struct virtio_pci_common_cfg){ 0 }).memb
> 
> No, expanding the offsetof macro and avoiding __builtin_offsetof misses
> the point of it.

It allows to get rid of the "extern struct virtio_pci_common_cfg ccfg;" 
though. That's an improvement.

Stefan




Re: Virtio 1.0 for the kernel

2019-01-11 Thread Stefan Fritsch
On Fri, 11 Jan 2019, Ted Unangst wrote:

> Stefan Fritsch wrote:
> > +#define CREAD(sc, memb)
> > \
> > +   ({  
> > \
> > +   struct virtio_pci_common_cfg c; 
> > \
> > +   size_t off = offsetof(struct virtio_pci_common_cfg, memb);  
> > \
> > +   size_t size = sizeof(c.memb);   
> > \
> > +   typeof(c.memb) val; 
> > \
> > +   
> > \
> > +   switch (size) { 
> > \
> > +   case 1: 
> > \
> > +   val = bus_space_read_1(sc->sc_iot, sc->sc_ioh, off);
> > \
> > +   break;  
> > \
> > +   case 2: 
> > \
> > +   val = bus_space_read_2(sc->sc_iot, sc->sc_ioh, off);
> > \
> > +   break;  
> > \
> > +   case 4: 
> > \
> > +   val = bus_space_read_4(sc->sc_iot, sc->sc_ioh, off);
> > \
> > +   break;  
> > \
> > +   case 8: 
> > \
> > +   val = bus_space_read_8(sc->sc_iot, sc->sc_ioh, off);
> > \
> > +   break;  
> > \
> > +   }   
> > \
> > +   DNPRINTF(2, "%s: %d: off %#zx size %#zx read %#llx\n",  
> > \
> > +  __func__, __LINE__, off, size, (unsigned long long)val); 
> > \
> > +   val;
> > \
> > +   })
> 
> I'm not a big fan of statement expressions. I thought we purged most of them
> from the tree in fact. I would say just use an inline. And no typeof.
> 
> (There's only one typeof in sys/ outside of drm, and I added it, but no more
> please.)
> 

A simple inline function does not work because memb is the name of a 
member of a struct. But combined with a macro it would work. Like this:


/* only used for sizeof, not actually allocated */
extern struct virtio_pci_common_cfg ccfg;

static inline
uint64_t _cread(struct virtio_pci_softc *sc, unsigned off, unsigned size)
{
uint64_t val;
switch (size) {
case 1:
val = bus_space_read_1(sc->sc_iot, sc->sc_ioh, off);
break;
case 2:
val = bus_space_read_2(sc->sc_iot, sc->sc_ioh, off);
break;
case 4:
val = bus_space_read_4(sc->sc_iot, sc->sc_ioh, off);
break;
case 8:
val = bus_space_read_8(sc->sc_iot, sc->sc_ioh, off);
break;
}
return val;
}

#define CREAD(sc, memb)  _cread(sc, \
offsetof(struct virtio_pci_common_cfg, memb), sizeof(ccfg.memb))


The compiler should optimize this to the same code as the complicated 
macro above. You think this variant is acceptable?

Cheers,
Stefan



Re: Virtio 1.0 for the kernel

2019-01-11 Thread Stefan Fritsch
Hi Carlos,

> 
> I'm getting ramped back up from being AWOL for several months, so
> apologies in advance for the delay of going through your diff (and the
> virtio header re-order one that you fixed).  It'll be over the weekend
> at the earliest before I can review/test it out.

There is no hurry, take your time. I will split the diff up into smaller 
chunks, so you may wait for that with a detailed review. But opinions 
about the offsetof hackery are welcome before that.

Cheers,
Stefan



Re: Virtio 1.0 for the kernel

2019-01-11 Thread Stefan Fritsch
On Fri, 11 Jan 2019, Jonathan Gray wrote:
> Assuming a qcow2 image created by something along the lines of
> 'qemu-img create -f qcow2 root.qcow2 2g' miniroot64.fs from an arm64
> snapshot and u-boot for qemu_arm64 from the u-boot-aarch64 package:
> 
> doas sh -c "qemu-system-aarch64 -runas $USER \
>   -M virt -serial stdio -m 1024 -cpu cortex-a57 \
>   -bios /usr/local/share/u-boot/qemu_arm64/u-boot.bin \
>   -device virtio-rng-device -netdev tap,id=net0 \
>   -device virtio-net-device,netdev=net0 \
>   -drive file=miniroot64.fs,if=none,id=drive0,format=raw \
>   -drive file=root.qcow2,if=none,id=drive1,format=qcow2 \
>   -device ich9-ahci,id=ahci \
>   -device ide-drive,drive=drive0,bus=ahci.0 \
>   -device ide-drive,drive=drive1,bus=ahci.1"

Thanks, that works. I had tried with some efi firmware before and not with 
u-boot.

Cheers,
Stefan



Re: Virtio 1.0 for the kernel

2019-01-11 Thread Stefan Fritsch
On Thu, 10 Jan 2019, Theo de Raadt wrote:

> arm64 also uses this subsystem, and as a result this diff breaks
> all those kernels.

The diff also breaks vmd. Even if it compiles it will still be broken. As 
I wrote, it's not ready for commit yet.

> 
> You ask how to run arm64 Uhm, you didn't even try to compile it.
> 
> 



Virtio 1.0 for the kernel

2019-01-10 Thread Stefan Fritsch
Hi,

the diff below implements the virtio 1.0 standard in the kernel (0.9 keeps 
working, of course). It's not ready for commit, yet, but I would like some 
input.

For the most part, the changes from 0.9 to 1.0 are not that big, but there 
are some notable differences. Since some headers are also used by vmd in 
userspace, how those things are handled in the kernel are also relevant to 
vmd.

One change is that the number of feature bits is no longer limited to 32. 
This means in the long run, defining constants with values that have only 
the relevant feature bit set won't work anymore. So eventually, we will 
have to change the constants to contain the bit offsets instead. Then we 
can add some macros to make handling easier. This is what the attached 
diff does. However, in the nearer future 64 bits will be sufficient and 
another possibility is to continue as now and define features as uint64_t 
values. So, should we make the switch from "#define foo (1<<24)" to 
"#define foo 24" now or at some later time, when devices start using more 
than 64 bits?

The biggest change in 1.0 is the way the resources are found and accessed 
for virtio_pci devices. This means that all the offsets for the generic 
configuration registers change between 0.9 and 1.0. One possibility would 
be to just define a second set of constants and use that for 1.0. However, 
since the register layout is documented as structs in the standard, I have 
tried something different and used some macros (CREAD/CWRITE in 
virtio_pci.c) and offsetof() magic to determine the register offset and 
width for an access. This has the advantage that the widths are 
automatically taken care of, whereas in the old style one has to manually 
pick the correct bus_dma_read/write_* function. However, I don't like the 
0.9 code to use the one style and the 1.0 code to use the other style. I 
also don't know how suitable the offsetof hackery would be for vmd. So I 
would like input on which way you think is preferable?

The diff does not implement 1.0 for virtio/mmio, yet. I hope that it does 
not break 0.9 virtio/mmio, but I have not tested that. So far I did not 
have success in running openbsd arm or aarch64 on qemu (openbsd panics). 
Any pointers about this?

So far, I have only tested the net, block and rng devices. It's possible 
that the other devices need some more minor tweaks, too.

And in case anyone wonders, there is no killer feature in virtio 1.0 that 
we really want to have. It's just that we run on hypervisors that don't 
support 0.9, and we run better on systems where 1.0-only is the default 
(libvirt does that when the newer qemu machine type q35 is used :-( ).

Cheers,
Stefan


diff --git a/share/man/man4/virtio.4 b/share/man/man4/virtio.4
index 5e60e2e0c32..6ee23a7b47d 100644
--- a/share/man/man4/virtio.4
+++ b/share/man/man4/virtio.4
@@ -22,7 +22,7 @@
 .Nd VirtIO support driver
 .Sh SYNOPSIS
 .Cd "virtio* at fdt?"
-.Cd "virtio* at pci?"
+.Cd "virtio* at pci? flags 0x00"
 .Sh DESCRIPTION
 The
 .Nm
@@ -56,7 +56,12 @@ control interface
 The
 .Nm
 driver conforms to the virtio 0.9.5 specification.
-The virtio 1.0 standard is not supported, yet.
+The virtio 1.0 standard is only supported for pci devices.
+.Pp
+By default 0.9 is preferred over 1.0.
+This can be changed by setting the bit 0x4 in the flags.
+.Pp
+Setting the bit 0x8 in the flags disables the 1.0 support completely.
 .Sh SEE ALSO
 .Xr intro 4
 .Sh HISTORY
diff --git a/sys/dev/fdt/virtio_mmio.c b/sys/dev/fdt/virtio_mmio.c
index a48091d7fca..3a74e647c19 100644
--- a/sys/dev/fdt/virtio_mmio.c
+++ b/sys/dev/fdt/virtio_mmio.c
@@ -89,7 +89,7 @@ void  virtio_mmio_write_device_config_2(struct 
virtio_softc *, int, uint16_t);
 void   virtio_mmio_write_device_config_4(struct virtio_softc *, int, 
uint32_t);
 void   virtio_mmio_write_device_config_8(struct virtio_softc *, int, 
uint64_t);
 uint16_t   virtio_mmio_read_queue_size(struct virtio_softc *, uint16_t);
-void   virtio_mmio_setup_queue(struct virtio_softc *, uint16_t, 
uint32_t);
+void   virtio_mmio_setup_queue(struct virtio_softc *, struct virtqueue 
*, uint64_t);
 void   virtio_mmio_set_status(struct virtio_softc *, int);
 uint32_t   virtio_mmio_negotiate_features(struct virtio_softc *, uint32_t,
  const struct virtio_feature_name 
*);
@@ -151,15 +151,18 @@ virtio_mmio_read_queue_size(struct virtio_softc *vsc, 
uint16_t idx)
 }
 
 void
-virtio_mmio_setup_queue(struct virtio_softc *vsc, uint16_t idx, uint32_t addr)
+virtio_mmio_setup_queue(struct virtio_softc *vsc, struct virtqueue *vq,
+uint64_t addr)
 {
struct virtio_mmio_softc *sc = (struct virtio_mmio_softc *)vsc;
-   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_SEL, idx);
+   bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_SEL,
+   vq->vq_index);
bus_space_write_4(sc->sc_iot, sc->sc_ioh, VIRTIO_MMIO_QUEUE_NUM,
 

Add virtio 1.0 pci ids

2019-01-10 Thread Stefan Fritsch
Everything above 0x1040 is 1.x only.

Also tweak descriptoin of memory balloon device. There will be a memory, 
too.

OK?

diff --git a/sys/dev/pci/pcidevs b/sys/dev/pci/pcidevs
index 14943bb1350..72f95a29b00 100644
--- a/sys/dev/pci/pcidevs
+++ b/sys/dev/pci/pcidevs
@@ -6692,12 +6692,17 @@ product PERLE SPEED8_LE 0xb008  Speed8 LE
 /* Qumranet products */
 product QUMRANET VIO_NET   0x1000  Virtio Network
 product QUMRANET VIO_BLOCK 0x1001  Virtio Storage
-product QUMRANET VIO_MEM   0x1002  Virtio Memory
+product QUMRANET VIO_MEM   0x1002  Virtio Memory Balloon
 product QUMRANET VIO_CONS  0x1003  Virtio Console
 product QUMRANET VIO_SCSI  0x1004  Virtio SCSI
 product QUMRANET VIO_RNG   0x1005  Virtio RNG
-product QUMRANET VIO_GPU   0x1050  Virtio GPU
-product QUMRANET VIO_INPUT 0x1052  Virtio Input
+product QUMRANET VIO1_NET  0x1041  Virtio 1.x Network
+product QUMRANET VIO1_BLOCK0x1042  Virtio 1.x Storage
+product QUMRANET VIO1_CONS 0x1043  Virtio 1.x Console
+product QUMRANET VIO1_RNG  0x1044  Virtio 1.x RNG
+product QUMRANET VIO1_SCSI 0x1048  Virtio 1.x SCSI
+product QUMRANET VIO1_GPU  0x1050  Virtio 1.x GPU
+product QUMRANET VIO1_INPUT0x1052  Virtio 1.x Input
 
 /* Ross -> Pequr -> ServerWorks -> Broadcom ServerWorks products */
 product RCC CMIC_LE0x  CMIC-LE



Driver for accessing qemu fwcfg via hostctl

2019-01-08 Thread Stefan Fritsch
Hi,

I have written this driver that allows to access qemu's fwcfg key/value 
store with the hostctl tool. At the moment, it's usefulness to me is not 
very high because it uses acpi to detect the device, qemu only exposes the 
device in the acpi tables in the q35 machine type, but libvirt configures 
virtio 1.0 devices for q35 by default and openbsd does not yet support 
virtio 1.0. But if you start qemu without libvirt or do the required 
libvirt tweaks, you can use it.

Now, since vmd has gained fwcfg support, maybe it can be useful for 
someone? I guess one would need to add the non-acpi detection method 
(which I think reads from some known port and expects a 'qemu' response). 
Also, hostctl does not handle binary values at the moment.

Any opinions if this should go in now? If the alternative detection method 
would be added, where would be a good location for the driver file? 
Probably not acpi.

Cheers,
Stefan

Example usage:

# hostctl -f /dev/pvbus1 /   
bootorder
etc/acpi/rsdp
etc/acpi/tables
etc/boot-fail-wait
etc/e820
etc/msr_feature_control
etc/smbios/smbios-anchor
etc/smbios/smbios-tables
etc/smi/features-ok
etc/smi/requested-features
etc/smi/supported-features
etc/system-states
etc/table-loader
etc/tpm/log
genroms/kvmvapic.bin
opt/foobar
vgaroms/sgabios.bin


# hostctl -f /dev/pvbus1 opt/foobar
I am a string


diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index d204dd889c9..9e59fe9429f 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -25,8 +25,8 @@ MAN=  aac.4 ac97.4 acphy.4 acrtc.4 \
eap.4 ec.4 eephy.4 ef.4 eg.4 ehci.4 eisa.4 el.4 em.4 emc.4 gcu.4 \
emu.4 enc.4 endrun.4 envy.4 eoip.4 ep.4 epic.4 esa.4 \
eso.4 ess.4 et.4 etherip.4 etphy.4 ex.4 exphy.4 exrtc.4 \
-   fanpwr.4 fd.4 fdc.4 fec.4 fins.4 fintek.4 fms.4 fuse.4 fxp.4 gdt.4 \
-   gentbi.4 gem.4 gif.4 \
+   fanpwr.4 fd.4 fdc.4 fec.4 fins.4 fintek.4 fms.4 fuse.4 fwcfg.4 fxp.4 \
+   gdt.4 gentbi.4 gem.4 gif.4 \
glenv.4 gpio.4 gpiodcf.4 gpioiic.4 gpioow.4 gpr.4 gre.4 gscsio.4 \
hds.4 hiclock.4 hidwusb.4 hifn.4 hil.4 hilid.4 hilkbd.4 hilms.4 \
hireset.4 hitemp.4 hme.4 hotplug.4 hsq.4 \
diff --git a/share/man/man4/fwcfg.4 b/share/man/man4/fwcfg.4
new file mode 100644
index 000..b02e16e1725
--- /dev/null
+++ b/share/man/man4/fwcfg.4
@@ -0,0 +1,44 @@
+.\"$OpenBSD: $
+.\"
+.\" Copyright (c) 2018 Stefan Fritsch
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: $
+.Dt FWCFG 4
+.Os
+.Sh NAME
+.Nm fwcfg
+.Nd QEMU fw_cfg configuration interface
+.Sh SYNOPSIS
+.Cd "fwcfg0 at acpi?"
+.Sh DESCRIPTION
+.Nm
+provides access to the QEMU Firmware Configuration (fw_cfg) Device via the
+pvbus driver.
+Values can be read using
+.Xr hostctl 8
+.Sh SEE ALSO
+.Xr intro 4 ,
+.Xr pvbus 4 ,
+.Xr hostctl 8
+.Sh HISTORY
+The
+.Nm
+driver first appeared in
+.Ox 6.4 .
+.Sh AUTHORS
+The
+.Nm
+driver was written by
+.An Stefan Fritsch Aq Mt s...@openbsd.org .
diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index dd880f70213..1a06e93246d 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -88,6 +88,8 @@ hyperv0   at pvbus?   # Hyper-V guest
 hvn*   at hyperv?  # Hyper-V NetVSC
 hvs*   at hyperv?  # Hyper-V StorVSC
 
+fwcfg0 at acpi?# QEMU fw_cfg
+
 option PCIVERBOSE
 option USBVERBOSE
 
diff --git a/sys/dev/acpi/files.acpi b/sys/dev/acpi/files.acpi
index bd5c3e95268..6118852146e 100644
--- a/sys/dev/acpi/files.acpi
+++ b/sys/dev/acpi/files.acpi
@@ -193,3 +193,8 @@ filedev/acpi/acpisurface.c  acpisurface
 # IPMI
 attach ipmi at acpi with ipmi_acpi
 file   dev/acpi/ipmi_acpi.cipmi_acpi
+
+# QEMU fw_cfg device
+device fwcfg
+attach fwcfg at acpi
+file   dev/acpi/fwcfg.cfwcfg
diff --git a/sys/dev/acpi/fwcfg.c b/sys/dev/acpi/fwcfg.c
new file mode 100644
index 000..54a53f0210f
--- /dev/null
+++ b/sys/dev/acpi/fwcfg.c
@@ -0,0 +1,280 @@
+/* $OpenBSD: $ */
+/*
+ * Copyright (c) 2018 Stefan Fritsch 
+ *
+ * Permission to use, copy, modify,

Re: Looking for testers for em(4) quirks patch

2018-04-29 Thread Stefan Fritsch
On Monday, 2 April 2018 11:52:08 CEST Stefan Fritsch wrote:
> It would be nice if people could give it a try on various em(4) hardware
> to check if there are any regressions.

Somewhat belatedly, this is now committed. Thanks to all the people who tested 
it.

Cheers,
Stefan





Re: [PATCH 7/7] em: Add magic delay for HP elitebook 820 G3

2018-04-22 Thread Stefan Fritsch
On Tue, 10 Apr 2018, Jonathan Gray wrote:

> On Sat, Apr 07, 2018 at 01:35:31PM +0200, Stefan Fritsch wrote:
> > On Fri, 6 Apr 2018, Jonathan Gray wrote:
> > 
> > > On Thu, Apr 05, 2018 at 09:57:23PM +0200, Stefan Fritsch wrote:
> > > > Add another magic 1ms delay that seems to help with some remaining
> > > > issues on an HP elitebook 820 G3 with i219LM. A printf() at the same
> > > > place helps, too.
> > > 
> > > Could you explain what the problem here was and why this place was
> > > chosen to add the delay?
> > 
> > The problems fixed by patches 1-6 were that there were these kinds of 
> > errors
> > 
> > em0: Hardware Initialization Failed
> > em0: Unable to initialize the hardware
> > 
> > either during em_attach (in this case em0 would simply not be available in 
> > ifconfig) or during ifconfig up. The problems only appeared if there was 
> > no link (or no cable plugged in) during boot or during resume.
> > 
> > The fixes 1 to 5 made the problem appear much less often, but it still 
> > appeared sometimes. I had some printfs added and I have never seen the 
> > problem with the printf present. In the end I traced the "fix" to the 
> > single printf at this exact location and I replaced it with a delay.
> > 
> > We added Patch 6 (the E1000_TARC0_CB_MULTIQ_2_REQ erratum) later, to fix 
> > different problems with watchdog timeouts that were only recoverable by a 
> > reboot, when pulling the cable or when the link was flaky.
> 
> If you have the HP machine that triggers the problem it should be possible
> to find which register write the delay is needed after?

That should be possible, yes. I don't know when I will have the time for 
that, though. 

> 
> > 
> > > 
> > > > ---
> > > >  sys/dev/pci/if_em_hw.c | 2 ++
> > > >  1 file changed, 2 insertions(+)
> > > > 
> > > > diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
> > > > index 7709a4c5805..d122e727875 100644
> > > > --- sys/dev/pci/if_em_hw.c
> > > > +++ sys/dev/pci/if_em_hw.c
> > > > @@ -1493,6 +1493,8 @@ em_init_hw(struct em_hw *hw)
> > > > /* Set the media type and TBI compatibility */
> > > > em_set_media_type(hw);
> > > >  
> > > > +   /* Magic delay that improves problems with i219LM on HP 
> > > > Elitebook */
> > > > +   msec_delay(1);
> > > > /* Must be called after em_set_media_type because media_type is 
> > > > used */
> > > > em_initialize_hardware_bits(hw);
> > > >  
> > > > -- 
> > > > 2.13.0
> > > > 
> > > 
> > 
> 
> 



Re: [PATCH 4/7] em: Improve access logic for software flag

2018-04-22 Thread Stefan Fritsch
Sorry for the late response.

On Tue, 10 Apr 2018, Jonathan Gray wrote:

> On Thu, Apr 05, 2018 at 09:57:20PM +0200, Stefan Fritsch wrote:
> > Some em chips have a semaphore ("software flag") to synchronize access
> > to certain registers between OS and firmware (ME/AMT).
> > 
> > Make the logic to get the flag match the logic in freebsd. This includes
> > higher timeouts and waiting for a previous unlock to complete before
> > trying a lock again.
> 
> Shouldn't the printfs remain as DEBUGOUT() and DEBUGOUT2() as they are
> in FreeBSD?

I am pretty sure that things will break if the SW flag cannot be aquired 
and I prefer to have meaningful error messages in that case. But I am fine 
either way.

> > ---
> >  sys/dev/pci/if_em_hw.c | 22 +++---
> >  sys/dev/pci/if_em_hw.h |  2 ++
> >  2 files changed, 21 insertions(+), 3 deletions(-)
> > 
> > diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
> > index e5084252c29..5bba83cbcd4 100644
> > --- sys/dev/pci/if_em_hw.c
> > +++ sys/dev/pci/if_em_hw.c
> > @@ -9613,9 +9613,21 @@ em_get_software_flag(struct em_hw *hw)
> > if (IS_ICH8(hw->mac_type)) {
> > while (timeout) {
> > extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
> > -   extcnf_ctrl |= E1000_EXTCNF_CTRL_SWFLAG;
> > -   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
> > +   if (!(extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG))
> > +   break;
> > +   msec_delay_irq(1);
> > +   timeout--;
> > +   }
> > +   if (!timeout) {
> > +   printf("%s: SW has already locked the resource?\n",
> > +   __func__);
> > +   return -E1000_ERR_CONFIG;
> > +   }
> > +   timeout = SW_FLAG_TIMEOUT;
> > +   extcnf_ctrl |= E1000_EXTCNF_CTRL_SWFLAG;
> > +   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
> >  
> > +   while (timeout) {
> > extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
> > if (extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG)
> > break;
> > @@ -9624,7 +9636,11 @@ em_get_software_flag(struct em_hw *hw)
> > }
> >  
> > if (!timeout) {
> > -   DEBUGOUT("FW or HW locks the resource too long.\n");
> > +   printf("Failed to acquire the semaphore, FW or HW "
> > +   "has it: FWSM=0x%8.8x EXTCNF_CTRL=0x%8.8x)\n",
> > +   E1000_READ_REG(hw, FWSM), extcnf_ctrl);
> > +   extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG;
> > +   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
> > return -E1000_ERR_CONFIG;
> > }
> > }
> > diff --git sys/dev/pci/if_em_hw.h sys/dev/pci/if_em_hw.h
> > index 5e415e60a34..71dc91e5582 100644
> > --- sys/dev/pci/if_em_hw.h
> > +++ sys/dev/pci/if_em_hw.h
> > @@ -2755,6 +2755,8 @@ struct em_host_command_info {
> >  #define AUTO_READ_DONE_TIMEOUT  10
> >  /* Number of milliseconds we wait for PHY configuration done after MAC 
> > reset */
> >  #define PHY_CFG_TIMEOUT 100
> > +/* SW Semaphore flag timeout in ms */
> > +#define SW_FLAG_TIMEOUT1000
> >  
> >  #define E1000_TX_BUFFER_SIZE ((uint32_t)1514)
> >  
> > -- 
> > 2.13.0
> > 
> 
> 



Re: [PATCH 1/7] em: Print error code and phy/mac type

2018-04-07 Thread Stefan Fritsch
On Fri, 6 Apr 2018, Jonathan Gray wrote:

> On Thu, Apr 05, 2018 at 09:57:17PM +0200, Stefan Fritsch wrote:
> > Print the error code if hardware initialization failed.
> > 
> > If EM_DEBUG is defined, print the phy/mac type during attach.
> 
> ok jsg@
> 
> Though these are just enum values we assign that don't come from the
> hardware.
> 
> mac_type is set based on pci product and in some cases revision.
> phy_type is mostly set based on hw->phy_id from PHY_ID1/PHY_ID2.

My workflow was looking at the code path through the em driver for this 
hardware, and then look at linux/freebsd what they do differently for the 
same hardware. Having these two values was necessary for that.



Re: [PATCH 7/7] em: Add magic delay for HP elitebook 820 G3

2018-04-07 Thread Stefan Fritsch
On Fri, 6 Apr 2018, Jonathan Gray wrote:

> On Thu, Apr 05, 2018 at 09:57:23PM +0200, Stefan Fritsch wrote:
> > Add another magic 1ms delay that seems to help with some remaining
> > issues on an HP elitebook 820 G3 with i219LM. A printf() at the same
> > place helps, too.
> 
> Could you explain what the problem here was and why this place was
> chosen to add the delay?

The problems fixed by patches 1-6 were that there were these kinds of 
errors

em0: Hardware Initialization Failed
em0: Unable to initialize the hardware

either during em_attach (in this case em0 would simply not be available in 
ifconfig) or during ifconfig up. The problems only appeared if there was 
no link (or no cable plugged in) during boot or during resume.

The fixes 1 to 5 made the problem appear much less often, but it still 
appeared sometimes. I had some printfs added and I have never seen the 
problem with the printf present. In the end I traced the "fix" to the 
single printf at this exact location and I replaced it with a delay.

We added Patch 6 (the E1000_TARC0_CB_MULTIQ_2_REQ erratum) later, to fix 
different problems with watchdog timeouts that were only recoverable by a 
reboot, when pulling the cable or when the link was flaky.

> 
> > ---
> >  sys/dev/pci/if_em_hw.c | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
> > index 7709a4c5805..d122e727875 100644
> > --- sys/dev/pci/if_em_hw.c
> > +++ sys/dev/pci/if_em_hw.c
> > @@ -1493,6 +1493,8 @@ em_init_hw(struct em_hw *hw)
> > /* Set the media type and TBI compatibility */
> > em_set_media_type(hw);
> >  
> > +   /* Magic delay that improves problems with i219LM on HP Elitebook */
> > +   msec_delay(1);
> > /* Must be called after em_set_media_type because media_type is used */
> > em_initialize_hardware_bits(hw);
> >  
> > -- 
> > 2.13.0
> > 
> 



[PATCH 1/7] em: Print error code and phy/mac type

2018-04-05 Thread Stefan Fritsch
Print the error code if hardware initialization failed.

If EM_DEBUG is defined, print the phy/mac type during attach.
---
 sys/dev/pci/if_em.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git sys/dev/pci/if_em.c sys/dev/pci/if_em.c
index ec8e35245ef..30df846117c 100644
--- sys/dev/pci/if_em.c
+++ sys/dev/pci/if_em.c
@@ -557,6 +557,9 @@ em_attach(struct device *parent, struct device *self, void 
*aux)
if (!defer)
em_update_link_status(sc);
 
+#ifdef EM_DEBUG
+   printf(", mac %#x phy %#x", sc->hw.mac_type, sc->hw.phy_type);
+#endif
printf(", address %s\n", ether_sprintf(sc->sc_ac.ac_enaddr));
 
/* Indicate SOL/IDER usage */
@@ -1860,8 +1863,8 @@ em_hardware_init(struct em_softc *sc)
INIT_DEBUGOUT("\nHardware Initialization Deferred ");
return (EAGAIN);
}
-   printf("\n%s: Hardware Initialization Failed\n",
-  DEVNAME(sc));
+   printf("\n%s: Hardware Initialization Failed: %d\n",
+  DEVNAME(sc), ret_val);
return (EIO);
}
 
-- 
2.13.0



[PATCH 6/7] em: Port an i219 errata workaround from FreeBSD

2018-04-05 Thread Stefan Fritsch
https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/i218-i219-ethernet-connection-spec-update.pdf?asset=9561
---
 sys/dev/pci/if_em.c| 4 +++-
 sys/dev/pci/if_em_hw.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git sys/dev/pci/if_em.c sys/dev/pci/if_em.c
index 30df846117c..233849f536b 100644
--- sys/dev/pci/if_em.c
+++ sys/dev/pci/if_em.c
@@ -2268,7 +2268,9 @@ em_initialize_transmit_unit(struct em_softc *sc)
EM_WRITE_REG(>hw, E1000_IOSFPC, reg_val);
 
reg_val = E1000_READ_REG(>hw, TARC0);
-   reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ;
+   /* i218-i219 Specification Update 1.5.4.5 */
+reg_val &= ~E1000_TARC0_CB_MULTIQ_3_REQ;
+reg_val |= E1000_TARC0_CB_MULTIQ_2_REQ;
E1000_WRITE_REG(>hw, TARC0, reg_val);
}
 }
diff --git sys/dev/pci/if_em_hw.h sys/dev/pci/if_em_hw.h
index 91993a6eb66..9c2cfe97569 100644
--- sys/dev/pci/if_em_hw.h
+++ sys/dev/pci/if_em_hw.h
@@ -2296,6 +2296,7 @@ struct em_hw {
 #define E1000_WUS_FLX_FILTERS 0x000F /* Mask for the 4 flexible filters */
 
 /* TRAC0 bits */
+#define E1000_TARC0_CB_MULTIQ_2_REQ (1 << 29)
 #define E1000_TARC0_CB_MULTIQ_3_REQ (1 << 28 | 1 << 29)
 
 /* Management Control */
-- 
2.13.0



[PATCH 3/7] em: Add em_check_phy_reset_block() quirk

2018-04-05 Thread Stefan Fritsch
Port the logic from freebsd to em_check_phy_reset_block(). A single
read does not seem to be reliable.
---
 sys/dev/pci/if_em_hw.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index 52d3ee95d15..e5084252c29 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -9539,9 +9539,18 @@ em_check_phy_reset_block(struct em_hw *hw)
DEBUGFUNC("em_check_phy_reset_block\n");
 
if (IS_ICH8(hw->mac_type)) {
-   fwsm = E1000_READ_REG(hw, FWSM);
-   return (fwsm & E1000_FWSM_RSPCIPHY) ? E1000_SUCCESS :
-   E1000_BLK_PHY_RESET;
+   int i = 0;
+   int blocked = 0;
+   do {
+   fwsm = E1000_READ_REG(hw, FWSM);
+   if (!(fwsm & E1000_FWSM_RSPCIPHY)) {
+   blocked = 1;
+   msec_delay(10);
+   continue;
+   }
+   blocked = 0;
+   } while (blocked && (i++ < 30));
+   return blocked ? E1000_BLK_PHY_RESET : E1000_SUCCESS;
}
if (hw->mac_type > em_82547_rev_2)
manc = E1000_READ_REG(hw, MANC);
-- 
2.13.0



[PATCH 4/7] em: Improve access logic for software flag

2018-04-05 Thread Stefan Fritsch
Some em chips have a semaphore ("software flag") to synchronize access
to certain registers between OS and firmware (ME/AMT).

Make the logic to get the flag match the logic in freebsd. This includes
higher timeouts and waiting for a previous unlock to complete before
trying a lock again.
---
 sys/dev/pci/if_em_hw.c | 22 +++---
 sys/dev/pci/if_em_hw.h |  2 ++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index e5084252c29..5bba83cbcd4 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -9613,9 +9613,21 @@ em_get_software_flag(struct em_hw *hw)
if (IS_ICH8(hw->mac_type)) {
while (timeout) {
extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
-   extcnf_ctrl |= E1000_EXTCNF_CTRL_SWFLAG;
-   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
+   if (!(extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG))
+   break;
+   msec_delay_irq(1);
+   timeout--;
+   }
+   if (!timeout) {
+   printf("%s: SW has already locked the resource?\n",
+   __func__);
+   return -E1000_ERR_CONFIG;
+   }
+   timeout = SW_FLAG_TIMEOUT;
+   extcnf_ctrl |= E1000_EXTCNF_CTRL_SWFLAG;
+   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
 
+   while (timeout) {
extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
if (extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG)
break;
@@ -9624,7 +9636,11 @@ em_get_software_flag(struct em_hw *hw)
}
 
if (!timeout) {
-   DEBUGOUT("FW or HW locks the resource too long.\n");
+   printf("Failed to acquire the semaphore, FW or HW "
+   "has it: FWSM=0x%8.8x EXTCNF_CTRL=0x%8.8x)\n",
+   E1000_READ_REG(hw, FWSM), extcnf_ctrl);
+   extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG;
+   E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
return -E1000_ERR_CONFIG;
}
}
diff --git sys/dev/pci/if_em_hw.h sys/dev/pci/if_em_hw.h
index 5e415e60a34..71dc91e5582 100644
--- sys/dev/pci/if_em_hw.h
+++ sys/dev/pci/if_em_hw.h
@@ -2755,6 +2755,8 @@ struct em_host_command_info {
 #define AUTO_READ_DONE_TIMEOUT  10
 /* Number of milliseconds we wait for PHY configuration done after MAC reset */
 #define PHY_CFG_TIMEOUT 100
+/* SW Semaphore flag timeout in ms */
+#define SW_FLAG_TIMEOUT1000
 
 #define E1000_TX_BUFFER_SIZE ((uint32_t)1514)
 
-- 
2.13.0



[PATCH 7/7] em: Add magic delay for HP elitebook 820 G3

2018-04-05 Thread Stefan Fritsch
Add another magic 1ms delay that seems to help with some remaining
issues on an HP elitebook 820 G3 with i219LM. A printf() at the same
place helps, too.
---
 sys/dev/pci/if_em_hw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index 7709a4c5805..d122e727875 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -1493,6 +1493,8 @@ em_init_hw(struct em_hw *hw)
/* Set the media type and TBI compatibility */
em_set_media_type(hw);
 
+   /* Magic delay that improves problems with i219LM on HP Elitebook */
+   msec_delay(1);
/* Must be called after em_set_media_type because media_type is used */
em_initialize_hardware_bits(hw);
 
-- 
2.13.0



[PATCH 5/7] em: Make em_get_software_flag() recursive

2018-04-05 Thread Stefan Fritsch
The em driver calls em_get_software_flag() recursively, which causes the
semaphore to be unlocked too early. Make em_get_software_flag and
em_release_software_flag handle this correctly.  Freebsd does not do
this, but they have a mutex that probably allows them to detect
recursive calls to e1000_acquire_swflag_ich8lan().  Reworking the
openbsd driver to not recursively get the semaphore would be very
invasive.
---
 sys/dev/pci/if_em_hw.c | 14 ++
 sys/dev/pci/if_em_hw.h |  1 +
 2 files changed, 15 insertions(+)

diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index 5bba83cbcd4..7709a4c5805 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -945,6 +945,8 @@ em_reset_hw(struct em_hw *hw)
}
em_get_software_flag(hw);
E1000_WRITE_REG(hw, CTRL, (ctrl | E1000_CTRL_RST));
+   /* HW reset releases software_flag */
+   hw->sw_flag = 0;
msec_delay(20);
 
/* Ungate automatic PHY configuration on non-managed 82579 */
@@ -9611,6 +9613,10 @@ em_get_software_flag(struct em_hw *hw)
DEBUGFUNC("em_get_software_flag");
 
if (IS_ICH8(hw->mac_type)) {
+   if (hw->sw_flag) {
+   hw->sw_flag++;
+   return E1000_SUCCESS;
+   }
while (timeout) {
extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
if (!(extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG))
@@ -9644,6 +9650,7 @@ em_get_software_flag(struct em_hw *hw)
return -E1000_ERR_CONFIG;
}
}
+   hw->sw_flag++;
return E1000_SUCCESS;
 }
 
@@ -9663,6 +9670,13 @@ em_release_software_flag(struct em_hw *hw)
DEBUGFUNC("em_release_software_flag");
 
if (IS_ICH8(hw->mac_type)) {
+   if (hw->sw_flag <= 0) {
+   printf("%s: not locked!\n", __func__);
+   return;
+   }
+   hw->sw_flag--;
+   if (hw->sw_flag > 0)
+   return;
extcnf_ctrl = E1000_READ_REG(hw, EXTCNF_CTRL);
extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG;
E1000_WRITE_REG(hw, EXTCNF_CTRL, extcnf_ctrl);
diff --git sys/dev/pci/if_em_hw.h sys/dev/pci/if_em_hw.h
index 71dc91e5582..91993a6eb66 100644
--- sys/dev/pci/if_em_hw.h
+++ sys/dev/pci/if_em_hw.h
@@ -1634,6 +1634,7 @@ struct em_hw {
 uint8_t bus_func;
 uint16_t swfw;
 boolean_t eee_enable;
+int sw_flag;
 };
 
 #define E1000_EEPROM_SWDPIN0   0x0001   /* SWDPIN 0 EEPROM Value */
-- 
2.13.0



[PATCH 2/7] em: Increase delay after reset to 20ms

2018-04-05 Thread Stefan Fritsch
This is the value in freebsd for ich8lan.
---
 sys/dev/pci/if_em_hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index df0fa571736..52d3ee95d15 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -945,7 +945,7 @@ em_reset_hw(struct em_hw *hw)
}
em_get_software_flag(hw);
E1000_WRITE_REG(hw, CTRL, (ctrl | E1000_CTRL_RST));
-   msec_delay(5);
+   msec_delay(20);
 
/* Ungate automatic PHY configuration on non-managed 82579 */
if (hw->mac_type == em_pch2lan && !hw->phy_reset_disable &&
-- 
2.13.0



Re: Looking for testers for em(4) quirks patch

2018-04-05 Thread Stefan Fritsch
On Thu, 5 Apr 2018, Mark Kettenis wrote:

> > Date: Thu, 5 Apr 2018 02:02:20 +1000
> > From: Jonathan Gray <j...@jsg.id.au>
> > 
> > On Mon, Apr 02, 2018 at 11:52:08AM +0200, Stefan Fritsch wrote:
> > > Hi,
> > > 
> > > We have seen problems with em on i219V and i219LM. For example, "Hardware 
> > > Initialization Failed" if no cable is plugged in during boot, or watchdog 
> > > timeouts / hangs until next boot if the cable is removed while data is 
> > > transmitted.
> > > 
> > > This patch adds a bunch of quirks and fixes from freebsd.
> > 
> > Can you split this into a few patches?  It would be easier to review/backout
> > individual parts if needed.

I can do that. Will take a bit, though.

> > 
> > Ie I wonder if the sw_flag parts could be done differently but other
> > parts could go in without that.

In the long term, it may be a good idea to scrap all files except if_em.c, 
write an openbsd specific e1000_osdep.c, and import all other files from 
freebsd. This would be similar to what is done for drm, though hopefully 
importing new versions of e1000 would be much less work than importing new 
versions of drm.

I see no other sane way to keep track of all the hardware quirks. But I 
don't have time for that in the near future.

> > printf format string could change from ", mac_type %#x phy_type %#x"
> > to ", mac %#x phy %#x" though I'm not sold on the value of printing
> > it to begin with.  The PCH ems all have a single PHY, MAC can be
> > inferred from pcidump.

The question is how many intel PDFs do you need to read to get from 
pcidump to phy/mac type.

> Must admit that it makes me a bit sad to print this as well.  Maybe
> wrap this in #ifdef DEBUG such that it is easy to enable for people
> when requested.

Will do that.

Cheers,
Stefan



Looking for testers for em(4) quirks patch

2018-04-02 Thread Stefan Fritsch
Hi,

We have seen problems with em on i219V and i219LM. For example, "Hardware 
Initialization Failed" if no cable is plugged in during boot, or watchdog 
timeouts / hangs until next boot if the cable is removed while data is 
transmitted.

This patch adds a bunch of quirks and fixes from freebsd.

It would be nice if people could give it a try on various em(4) hardware 
to check if there are any regressions.

Comments on the patch are of course welcome, too.

Cheers,
Stefan

* Some em chips have a semaphore ("software flag") to synchronize access
  to certain registers between OS and firmware (ME/AMT).

  Make the logic to get the flag match the logic in freebsd. This includes
  higher timeouts and waiting for a previous unlock to complete before
  trying a lock again.

* Another problem was that openbsd em driver calls em_get_software_flag
  recursively, which causes the semaphore to be unlocked too early. Make
  em_get_software_flag/em_release_software_flag handle this correctly.
  Freebsd does not do this, but they have a mutex that probably allows
  them to detect recursive calls to e1000_acquire_swflag_ich8lan().
  Reworking the openbsd driver to not recursively get the semaphore would
  be very invasive.

* Port the logic from freebsd to em_check_phy_reset_block(). A single
  read does not seem to be reliable.

* Also, increase delay after reset to 20ms, which is the value in freebsd
  for ich8lan.

* Add another magic 1ms delay that seems to help with some remaining issues
  on an HP elitebook 820 G3. A printf() at the same place helps, too.

* Port a silicon errata workaround from FreeBSD.

  
https://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/i218-i219-ethernet-connection-spec-update.pdf?asset=9561

* While there, print mac+phy type in em_attach(). This makes it easier to
  determine which quirks are hit/missing when comparing to freebsd.

* Also print em_init_hw() error code if something goes wrong.


diff --git sys/dev/pci/if_em.c sys/dev/pci/if_em.c
index ec8e35245ef..f6a1f1c3894 100644
--- sys/dev/pci/if_em.c
+++ sys/dev/pci/if_em.c
@@ -557,6 +557,8 @@ em_attach(struct device *parent, struct device *self, void 
*aux)
if (!defer)
em_update_link_status(sc);
 
+   printf(", mac_type %#x phy_type %#x", sc->hw.mac_type,
+   sc->hw.phy_type);
printf(", address %s\n", ether_sprintf(sc->sc_ac.ac_enaddr));
 
/* Indicate SOL/IDER usage */
@@ -1860,8 +1862,8 @@ em_hardware_init(struct em_softc *sc)
INIT_DEBUGOUT("\nHardware Initialization Deferred ");
return (EAGAIN);
}
-   printf("\n%s: Hardware Initialization Failed\n",
-  DEVNAME(sc));
+   printf("\n%s: Hardware Initialization Failed: %d\n",
+  DEVNAME(sc), ret_val);
return (EIO);
}
 
@@ -2265,7 +2267,9 @@ em_initialize_transmit_unit(struct em_softc *sc)
EM_WRITE_REG(>hw, E1000_IOSFPC, reg_val);
 
reg_val = E1000_READ_REG(>hw, TARC0);
-   reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ;
+   /* i218-i219 Specification Update 1.5.4.5 */
+reg_val &= ~E1000_TARC0_CB_MULTIQ_3_REQ;
+reg_val |= E1000_TARC0_CB_MULTIQ_2_REQ;
E1000_WRITE_REG(>hw, TARC0, reg_val);
}
 }
diff --git sys/dev/pci/if_em_hw.c sys/dev/pci/if_em_hw.c
index df0fa571736..d122e727875 100644
--- sys/dev/pci/if_em_hw.c
+++ sys/dev/pci/if_em_hw.c
@@ -945,7 +945,9 @@ em_reset_hw(struct em_hw *hw)
}
em_get_software_flag(hw);
E1000_WRITE_REG(hw, CTRL, (ctrl | E1000_CTRL_RST));
-   msec_delay(5);
+   /* HW reset releases software_flag */
+   hw->sw_flag = 0;
+   msec_delay(20);
 
/* Ungate automatic PHY configuration on non-managed 82579 */
if (hw->mac_type == em_pch2lan && !hw->phy_reset_disable &&
@@ -1491,6 +1493,8 @@ em_init_hw(struct em_hw *hw)
/* Set the media type and TBI compatibility */
em_set_media_type(hw);
 
+   /* Magic delay that improves problems with i219LM on HP Elitebook */
+   msec_delay(1);
/* Must be called after em_set_media_type because media_type is used */
em_initialize_hardware_bits(hw);
 
@@ -9539,9 +9543,18 @@ em_check_phy_reset_block(struct em_hw *hw)
DEBUGFUNC("em_check_phy_reset_block\n");
 
if (IS_ICH8(hw->mac_type)) {
-   fwsm = E1000_READ_REG(hw, FWSM);
-   return (fwsm & E1000_FWSM_RSPCIPHY) ? E1000_SUCCESS :
-   E1000_BLK_PHY_RESET;
+   int i = 0;
+   int blocked = 0;
+   do {
+   fwsm = E1000_READ_REG(hw, FWSM);
+   if (!(fwsm & E1000_FWSM_RSPCIPHY)) {
+   blocked = 1;
+   

More docs for buffer cache, block sizes, etc.

2017-08-15 Thread Stefan Fritsch
Hi,

here are some comments / man page updates for things I have learned in my 
adventures through msdosfs and vfs_bio.c. I have also added the 
bread_cluster function to buffercache(9). It would be nice if someone who 
knows the buffer/disk stuff could review this.

Hint for the paragraph on block sizes: In ufs/ffs/ffs_vfsops.c there is:

sbp->f_iosize = fs->fs_bsize;

Then look at the use of fs_bsize in ufs/ffs/fs.h

Cheers,
Stefan

diff --git a/share/man/man9/buffercache.9 b/share/man/man9/buffercache.9
index 84df0c06513..0c4c8f1ad96 100644
--- a/share/man/man9/buffercache.9
+++ b/share/man/man9/buffercache.9
@@ -108,6 +108,7 @@
 .Sh NAME
 .Nm buffercache ,
 .Nm bread ,
+.Nm bread_cluster ,
 .Nm breadn ,
 .Nm bwrite ,
 .Nm bawrite ,
@@ -126,6 +127,9 @@
 .Fn bread "struct vnode *vp" "daddr_t blkno" "int size" \
 "struct buf **bpp"
 .Ft int
+.Fn bread_cluster "struct vnode *vp" "daddr_t blkno" "int size" \
+"struct buf **bpp"
+.Ft int
 .Fn breadn "struct vnode *vp" "daddr_t blkno" "int size" \
 "daddr_t rablks[]" "int rasizes[]" "int nrablks" \
 "struct buf **bpp"
@@ -163,6 +167,11 @@ In addition to describing a cached block, a
 .Em buf
 structure is also used to describe an I/O request as a part of
 the disk driver interface.
+.Pp
+The block size used for logical block numbers depends on the type of the
+given vnode.
+For file vnodes, this is f_iosize of the underlying filesystem.
+For block device vnodes, this will usually be DEV_BSIZE.
 .\" XXX struct buf, B_ flags, MP locks, etc.
 .\" XXX free list, hash queue, etc.
 .\" 
@@ -184,6 +193,10 @@ to allocate a buffer with enough pages for
 .Fa size
 and reads the specified disk block into it.
 .Pp
+.Fn bread
+always returns a buffer, even if it returns an error due to an I/O
+error.
+.Pp
 The buffer returned by
 .Fn bread
 is marked as busy.
@@ -222,6 +235,30 @@ and
 The read-ahead blocks aren't returned, but are available in cache for
 future accesses.
 .\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+.It Xo
+.Fo bread_cluster
+.Fa "vp"
+.Fa "blkno"
+.Fa "size"
+.Fa "bpp"
+.Fc
+.Xc
+Read a block of size
+.Fa "size"
+corresponding to
+.Fa vp
+and
+.Fa blkno ,
+with readahead.
+If neither the first block, nor a part of the next MAXBSIZE bytes is already
+in the buffer cache,
+.Fn bread_cluster
+will perform a read-ahead of MAXBSIZE bytes in a single I/O operation.
+This is currently more efficient than
+.Fn breadn .
+The read-ahead data isn't returned, but is available in cache for
+future accesses.
+.\" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 .It Fn bwrite "bp"
 Write a block.
 Start I/O for write using
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 88adfeff237..4804d370f49 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -568,6 +568,12 @@ bread_cluster_callback(struct buf *bp)
}
 }
 
+/*
+ * Read-ahead multiple disk blocks, but make sure only one (big) I/O
+ * request is sent to the disk.
+ * XXX This should probably be dropped and breadn should instead be optimized
+ * XXX to do fewer I/O requests.
+ */
 int
 bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
 {
@@ -1023,6 +1029,9 @@ geteblk(size_t size)
 
 /*
  * Allocate a buffer.
+ * If vp is given, put it into the buffer cache for that vnode.
+ * If size != 0, allocate memory and call buf_map().
+ * If there is already a buffer for the given vnode/blkno, return NULL.
  */
 struct buf *
 buf_get(struct vnode *vp, daddr_t blkno, size_t size)



Skipping amd errata on hypervisors?

2017-08-15 Thread Stefan Fritsch
I have got a report that openbsd panics on boot with qemu -cpu Opteron_G3 
(but Opteron_G2 works).

kernel: protection fault trap, code=0
Stopped at  amd64_errata_setmsr+0x14:   rdmsr
ddb{0}> >> OpenBSD/amd64 BOOT 3.33
boot>

Qemu does not implement all the secret errata MSRs. Does it make sense to 
simply skip all errata processing if we detect a hypervisor?


diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
index a448b885ba7..371c0c8ff48 100644
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -708,7 +708,7 @@ identifycpu(struct cpu_info *ci)
}
 #endif
 
-   if (!strcmp(cpu_vendor, "AuthenticAMD"))
+   if (!strcmp(cpu_vendor, "AuthenticAMD") && !ISSET(cpu_ecxfeature, 
CPUIDECX_HV))
amd64_errata(ci);
 
if (CPU_IS_PRIMARY(ci) && !strcmp(cpu_vendor, "CentaurHauls")) {



Implement VFS read clustering for MSDOSFS, third try

2017-08-15 Thread Stefan Fritsch
Hi,

this is another try at making read clustering work for msdosfs.

Last year, mpi@ implemented VFS read clustering for MSDOSFS in

sys/msdosfs/denode.h 1.28
sys/msdosfs/msdosfs_vnops.c 1.105

This caused regressions when doing seeks past the end of the file and had.
to be reverted. Then I tried to fix that in

denode.h,v 1.31
msdosfs_vnops.c,v 1.114 

But that caused different problems and had to be reverted, too. I have 
another test for that that I will commit soon.

It seems that while netbsd uses DEV_BSIZE as base for the logical block 
numbers even in files, openbsd is more like freebsd and uses 
mnt_stat.f_iosize in that case. However, if the file system does accesses 
for metadata on the disk, it has to use DEV_BSIZE for accesses. So the 
fixes in 1.31/1.114 were wrong and the following diff is again based on 
1.28/1.105 with some fixes in extendfile() to make seeks past the end of 
the file work.


Testers/reviews/OKs welcome. Especially people who need msdosfs for 
booting via uefi. In any case, I won't commit until I get home again in a 
week.

If my analysis with the block sizes is correct, I will add some 
clarifications to the bread/bwrite man page later.

Cheers,
Stefan


diff --git a/sys/msdosfs/denode.h b/sys/msdosfs/denode.h
index 38ae2027a63..4e521c6d5b8 100644
--- a/sys/msdosfs/denode.h
+++ b/sys/msdosfs/denode.h
@@ -142,7 +142,6 @@ struct denode {
struct vnode *de_devvp; /* vnode of blk dev we live on */
uint32_t de_flag;   /* flag bits */
dev_t de_dev;   /* device where direntry lives */
-   daddr_t de_lastr;
uint32_t de_dirclust;   /* cluster of the directory file containing 
this entry */
uint32_t de_diroffset;  /* offset of this entry in the directory 
cluster */
uint32_t de_fndoffset;  /* offset of found dir entry */
diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c
index 9e4675fa7e5..bee7c07f4d7 100644
--- a/sys/msdosfs/msdosfs_fat.c
+++ b/sys/msdosfs/msdosfs_fat.c
@@ -1021,14 +1021,12 @@ extendfile(struct denode *dep, uint32_t count, struct 
buf **bpp, uint32_t *ncp,
bp = getblk(pmp->pm_devvp, cntobn(pmp, 
cn++),
pmp->pm_bpcluster, 0, 0);
else {
-   bp = getblk(DETOV(dep), de_cn2bn(pmp, 
frcn++),
+   bp = getblk(DETOV(dep), frcn++,
pmp->pm_bpcluster, 0, 0);
/*
 * Do the bmap now, as in msdosfs_write
 */
-   if (pcbmap(dep,
-   de_bn2cn(pmp, bp->b_lblkno),
-   >b_blkno, 0, 0))
+   if (pcbmap(dep, bp->b_lblkno, 
>b_blkno, 0, 0))
bp->b_blkno = -1;
if (bp->b_blkno == -1)
panic("extendfile: pcbmap");
diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c
index 8452e9be80c..cadaf78920d 100644
--- a/sys/msdosfs/msdosfs_vnops.c
+++ b/sys/msdosfs/msdosfs_vnops.c
@@ -77,13 +77,13 @@
 
 static uint32_t fileidhash(uint64_t);
 
+int msdosfs_bmaparray(struct vnode *, daddr_t, daddr_t *, int *);
 int msdosfs_kqfilter(void *);
 int filt_msdosfsread(struct knote *, long);
 int filt_msdosfswrite(struct knote *, long);
 int filt_msdosfsvnode(struct knote *, long);
 void filt_msdosfsdetach(struct knote *);
 
-
 /*
  * Some general notes:
  *
@@ -509,18 +509,14 @@ int
 msdosfs_read(void *v)
 {
struct vop_read_args *ap = v;
-   int error = 0;
-   uint32_t diff;
-   int blsize;
-   int isadir;
-   uint32_t n;
-   long on;
-   daddr_t lbn, rablock, rablkno;
-   struct buf *bp;
struct vnode *vp = ap->a_vp;
struct denode *dep = VTODE(vp);
struct msdosfsmount *pmp = dep->de_pmp;
struct uio *uio = ap->a_uio;
+   int isadir, error = 0;
+   uint32_t n, diff, size, on;
+   struct buf *bp;
+   daddr_t cn, bn;
 
/*
 * If they didn't ask for any data, then we are done.
@@ -535,7 +531,8 @@ msdosfs_read(void *v)
if (uio->uio_offset >= dep->de_FileSize)
return (0);
 
-   lbn = de_cluster(pmp, uio->uio_offset);
+   cn = de_cluster(pmp, uio->uio_offset);
+   size = pmp->pm_bpcluster;
on = uio->uio_offset & pmp->pm_crbomask;
n = ulmin(pmp->pm_bpcluster - on, uio->uio_resid);
 
@@ -548,31 +545,23 @@ msdosfs_read(void *v)
if (diff < n)
n = diff;
 
-   /* convert cluster # to block # if a directory */
-   if 

minor msdosfs tweaks

2017-08-13 Thread Stefan Fritsch
add to comments for pcbmap()
remove useless ";"

OK?

diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c
index 3a3d4de0761..aeb822d4be2 100644
--- a/sys/msdosfs/msdosfs_fat.c
+++ b/sys/msdosfs/msdosfs_fat.c
@@ -123,6 +123,8 @@ fatblock(struct msdosfsmount *pmp, uint32_t ofs, uint32_t 
*bnp, uint32_t *sizep,
  * If this pointer is null then don't return this quantity.
  * cnp   - address of where to place the file system relative cluster number.
  * If this pointer is null then don't return this quantity.
+ * sp- address of where to place the block size for the file/dir
+ * If this pointer is null then don't return this quantity.
  *
  * NOTE: Either bnp or cnp must be non-null.
  * This function has one side effect.  If the requested file relative cluster
@@ -258,7 +260,7 @@ pcbmap(struct denode *dep, uint32_t findcn, daddr_t *bnp, 
uint32_t *cnp,
return (0);
}
 
-hiteof:;
+hiteof:
if (cnp)
*cnp = i;
if (bp)



Looking for testers with a floppy drive

2017-08-13 Thread Stefan Fritsch
Hi,

is anyone still using floppies? If yes, it would be nice if they could 
give the diff below a test. It defers probing of the drives which reduces 
boot time quite a bit. If floppy does not work with the diff, please also 
test if it works without it. On qemu, floppy access seems to be broken (or 
I did something wrong).

Cheers,
Stefan


diff --git a/sys/dev/isa/fd.c b/sys/dev/isa/fd.c
index 3e16d054428..460830a97cc 100644
--- a/sys/dev/isa/fd.c
+++ b/sys/dev/isa/fd.c
@@ -210,11 +210,11 @@ fdprobe(struct device *parent, void *match, void *aux)
/* select drive and turn on motor */
bus_space_write_1(iot, ioh, fdout, drive | FDO_FRST | FDO_MOEN(drive));
/* wait for motor to spin up */
-   delay(25);
+   tsleep(fdc, 0, "fdprobe", 250 * hz / 1000);
out_fdc(iot, ioh, NE7CMD_RECAL);
out_fdc(iot, ioh, drive);
/* wait for recalibrate */
-   delay(200);
+   tsleep(fdc, 0, "fdprobe", 2000 * hz / 1000);
out_fdc(iot, ioh, NE7CMD_SENSEI);
n = fdcresult(fdc);
 #ifdef FD_DEBUG
@@ -228,7 +228,7 @@ fdprobe(struct device *parent, void *match, void *aux)
 #endif
 
/* turn off motor */
-   delay(25);
+   tsleep(fdc, 0, "fdprobe", 250 * hz / 1000);
bus_space_write_1(iot, ioh, fdout, FDO_FRST);
 
/* flags & 0x20 forces the drive to be found even if it won't probe */
diff --git a/sys/dev/isa/fdc.c b/sys/dev/isa/fdc.c
index c7b7ae4562c..ee195367322 100644
--- a/sys/dev/isa/fdc.c
+++ b/sys/dev/isa/fdc.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -79,6 +80,8 @@
 /* controller driver configuration */
 int fdcprobe(struct device *, void *, void *);
 void fdcattach(struct device *, struct device *, void *);
+void fdcattach_deferred(void *);
+void fdc_create_kthread(void *);
 
 struct cfattach fdc_ca = {
sizeof(struct fdc_softc), fdcprobe, fdcattach
@@ -139,8 +142,6 @@ fdcattach(struct device *parent, struct device *self, void 
*aux)
bus_space_handle_t ioh;
bus_space_handle_t ioh_ctl;
struct isa_attach_args *ia = aux;
-   struct fdc_attach_args fa;
-   int type;
 
iot = ia->ia_iot;
 
@@ -163,6 +164,22 @@ fdcattach(struct device *parent, struct device *self, void 
*aux)
fdc->sc_ih = isa_intr_establish(ia->ia_ic, ia->ia_irq, IST_EDGE,
IPL_BIO, fdcintr, fdc, fdc->sc_dev.dv_xname);
 
+   kthread_create_deferred(fdc_create_kthread, fdc);
+}
+
+void
+fdc_create_kthread(void *arg)
+{
+   kthread_create(fdcattach_deferred, arg, NULL, "fdcattach");
+}
+
+void
+fdcattach_deferred(void *arg)
+{
+   struct fdc_softc *fdc = arg;
+   struct fdc_attach_args fa;
+   int type;
+
 #if defined(__i386__) || defined(__amd64__)
/*
 * The NVRAM info only tells us about the first two disks on the
@@ -187,8 +204,9 @@ fdcattach(struct device *parent, struct device *self, void 
*aux)
else
 #endif
fa.fa_deftype = NULL;   /* unknown */
-   (void)config_found(self, (void *), fddprint);
+   (void)config_found(>sc_dev, (void *), fddprint);
}
+   kthread_exit(0);
 }
 
 /*



Re: em: workaround for ultra-low-power mode on i219V

2017-08-02 Thread Stefan Fritsch
On Tue, 1 Aug 2017, Stefan Fritsch wrote:
> For consumer NICs, the patch only does something on I218_LM_3/I218_V_3 or 
> I219* or newer.

Sorry, misread the code there. There are more i218 variants that are
affected by the patch.



em: workaround for ultra-low-power mode on i219V

2017-08-01 Thread Stefan Fritsch
Hi,

we have the problem that on some HP laptops with i219V, it sometimes 
happens that em fails to attach with this error:

em0: Hardware Initialization Failed
em0: Unable to initialize the hardware

It seems there was some previous discussion of this issue here: 
http://bugs.openbsd.narkive.com/95VRrEJX/hardware-initialization-failed-with-intel-i219-v

The most reliable way to reproduce it is by booting Windows 10 first and 
then rebooting into openbsd without switching the laptop off. But 
sometimes, it also happens without having booted Windows first. Also, we 
had a case where the broken state persisted even if the laptop was 
switched off for a short time.  After the laptop had been switched off for 
several minutes, it worked again.

We have never seen the problem with i219LM (both on Fujitsu and HP 
laptops).

The attached patch seems to fix the problem. It ports the 
e1000_disable_ulp_lpt_lp() logic from the FreeBSD driver to disable 
ultra-low-power (ULP) mode.

The code has been merged in a way to make the diff from FreeBSD minimal.  
For example, the SWFW register is called H2ME on newer chips, so a new 
define is introduced. Also, the em_toggle_lanphypc_pch_lpt() function is 
left as separate function even though only used in one place at the 
moment.


For consumer NICs, the patch only does something on I218_LM_3/I218_V_3 or 
I219* or newer.  I am not quite sure which server chips are matching 
em_pch_spt. Testers, comments, OKs are welcome.

Cheers,
Stefan

diff --git a/sys/dev/pci/if_em_hw.c b/sys/dev/pci/if_em_hw.c
index d108555..8b3a0a0 100644
--- a/sys/dev/pci/if_em_hw.c
+++ b/sys/dev/pci/if_em_hw.c
@@ -93,6 +93,8 @@ static int32_tem_init_lcd_from_nvm(struct em_hw *);
 static int32_t em_phy_no_cable_workaround(struct em_hw *);
 static voidem_init_rx_addrs(struct em_hw *);
 static voidem_initialize_hardware_bits(struct em_hw *);
+static voidem_toggle_lanphypc_pch_lpt(struct em_hw *);
+static int em_disable_ulp_lpt_lp(struct em_hw *hw, bool force);
 static boolean_t em_is_onboard_nvm_eeprom(struct em_hw *);
 static int32_t em_kumeran_lock_loss_workaround(struct em_hw *);
 static int32_t em_mng_enable_host_if(struct em_hw *);
@@ -1194,6 +1196,192 @@ em_initialize_hardware_bits(struct em_hw *hw)
}
 }
 
+/**
+ *  e1000_toggle_lanphypc_pch_lpt - toggle the LANPHYPC pin value
+ *  @hw: pointer to the HW structure
+ *
+ *  Toggling the LANPHYPC pin value fully power-cycles the PHY and is
+ *  used to reset the PHY to a quiescent state when necessary.
+ **/
+static void
+em_toggle_lanphypc_pch_lpt(struct em_hw *hw)
+{
+   uint32_t mac_reg;
+
+   DEBUGFUNC("e1000_toggle_lanphypc_pch_lpt");
+
+   /* Set Phy Config Counter to 50msec */
+   mac_reg = E1000_READ_REG(hw, FEXTNVM3);
+   mac_reg &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
+   mac_reg |= E1000_FEXTNVM3_PHY_CFG_COUNTER_50MSEC;
+   E1000_WRITE_REG(hw, FEXTNVM3, mac_reg);
+
+   /* Toggle LANPHYPC Value bit */
+   mac_reg = E1000_READ_REG(hw, CTRL);
+   mac_reg |= E1000_CTRL_LANPHYPC_OVERRIDE;
+   mac_reg &= ~E1000_CTRL_LANPHYPC_VALUE;
+   E1000_WRITE_REG(hw, CTRL, mac_reg);
+   E1000_WRITE_FLUSH(hw);
+   msec_delay(1);
+   mac_reg &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
+   E1000_WRITE_REG(hw, CTRL, mac_reg);
+   E1000_WRITE_FLUSH(hw);
+
+   if (hw->mac_type < em_pch_lpt) {
+   msec_delay(50);
+   } else {
+   uint16_t count = 20;
+
+   do {
+   msec_delay(5);
+   } while (!(E1000_READ_REG(hw, CTRL_EXT) &
+   E1000_CTRL_EXT_LPCD) && count--);
+
+   msec_delay(30);
+   }
+}
+
+/**
+ *  em_disable_ulp_lpt_lp - unconfigure Ultra Low Power mode for LynxPoint-LP
+ *  @hw: pointer to the HW structure
+ *  @force: boolean indicating whether or not to force disabling ULP
+ *
+ *  Un-configure ULP mode when link is up, the system is transitioned from
+ *  Sx or the driver is unloaded.  If on a Manageability Engine (ME) enabled
+ *  system, poll for an indication from ME that ULP has been un-configured.
+ *  If not on an ME enabled system, un-configure the ULP mode by software.
+ *
+ *  During nominal operation, this function is called when link is acquired
+ *  to disable ULP mode (force=FALSE); otherwise, for example when unloading
+ *  the driver or during Sx->S0 transitions, this is called with force=TRUE
+ *  to forcibly disable ULP.
+ */
+static int
+em_disable_ulp_lpt_lp(struct em_hw *hw, bool force)
+{
+   int ret_val = E1000_SUCCESS;
+   uint32_t mac_reg;
+   uint16_t phy_reg;
+   int i = 0;
+
+   if ((hw->mac_type < em_pch_lpt) ||
+   (hw->device_id == E1000_DEV_ID_PCH_LPT_I217_LM) ||
+   (hw->device_id == E1000_DEV_ID_PCH_LPT_I217_V) ||
+   (hw->device_id == E1000_DEV_ID_PCH_I218_LM2) ||
+   (hw->device_id == E1000_DEV_ID_PCH_I218_V2))
+   return 0;
+
+   if (E1000_READ_REG(hw, 

Re: Allowing devices to set a minimum wait time at power down/reboot

2017-07-10 Thread Stefan Fritsch
On Sun, 9 Jul 2017, Mark Kettenis wrote:
> > there are some open issues that usb devices do not flush their caches fast 
> > enough (at least on some boards). And my mail about shortening the wait 
> > times in reboot(8) brought up that issue again.
> > 
> > My suggestion would be to have a global variable that takes the minimum 
> > delay for the next shutdown/reboot and affected devices could set that 
> > value in their DVACT_SUSPEND hook. That way, if a device is removed before 
> > the shudown, the delay won't trigger.
> > 
> > The diff below is only compile tested by I think it demonstrates the idea.
> > 
> > One could also do something more fancy like using different delays for 
> > reboot and powerdown, or making umass only set a new SDEV_ flag and then 
> > have the scsi layer only call set_powerdown_delay() if the device has been 
> > written to.
> > 
> > Opinions?
> 
> Such devices should make sure they flush their caches and properly
> wait for that operation to complete.

That would be preferable, but there is some evidence that not all usb 
thumbdrives implement cache flush correctly:

http://marc.info/?l=openbsd-misc=144237305322074=2
http://marc.info/?l=openbsd-tech=147688940628911=2

The fact that it only happens on a few mainboards could be explained if it 
depends on how quickly the mainboard powers off the usb bus on shutdown 
(and reset).



Allowing devices to set a minimum wait time at power down/reboot

2017-07-09 Thread Stefan Fritsch
Hi,

there are some open issues that usb devices do not flush their caches fast 
enough (at least on some boards). And my mail about shortening the wait 
times in reboot(8) brought up that issue again.

My suggestion would be to have a global variable that takes the minimum 
delay for the next shutdown/reboot and affected devices could set that 
value in their DVACT_SUSPEND hook. That way, if a device is removed before 
the shudown, the delay won't trigger.

The diff below is only compile tested by I think it demonstrates the idea.

One could also do something more fancy like using different delays for 
reboot and powerdown, or making umass only set a new SDEV_ flag and then 
have the scsi layer only call set_powerdown_delay() if the device has been 
written to.

Opinions?

Cheers,
Stefan

diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
index 9653de19924..ed4bb545ec7 100644
--- a/sys/arch/amd64/amd64/machdep.c
+++ b/sys/arch/amd64/amd64/machdep.c
@@ -86,6 +86,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -715,6 +716,7 @@ struct pcb dumppcb;
 __dead void
 boot(int howto)
 {
+   unsigned delay_ms;
if ((howto & RB_POWERDOWN) != 0)
lid_action = 0;
 
@@ -756,7 +758,8 @@ haltsys:
extern int acpi_enabled;
 
if (acpi_enabled) {
-   delay(50);
+   delay_ms = MAX(500, powerdown_delay);
+   delay(delay_ms * 1000);
if ((howto & RB_POWERDOWN) != 0)
acpi_powerdown();
}
@@ -770,8 +773,9 @@ haltsys:
}
 
printf("rebooting...\n");
-   if (cpureset_delay > 0)
-   delay(cpureset_delay * 1000);
+   delay_ms = MAX(cpureset_delay, powerdown_delay);
+   if (delay_ms > 0)
+   delay(delay_ms * 1000);
cpu_reset();
for (;;)
continue;
diff --git a/sys/dev/acpi/acpi.c b/sys/dev/acpi/acpi.c
index 09495657aad..d2380d6d78d 100644
--- a/sys/dev/acpi/acpi.c
+++ b/sys/dev/acpi/acpi.c
@@ -2444,6 +2444,10 @@ acpi_sleep_state(struct acpi_softc *sc, int sleepmode)
acpi_disable_allgpes(sc);
acpi_enable_wakegpes(sc, state);
 
+   if (powerdown_delay > 0) {
+   delay(1000 * powerdown_delay);
+   powerdown_delay = 0;
+   }
/* Sleep */
sc->sc_state = state;
error = acpi_sleep_cpu(sc, state);
diff --git a/sys/dev/usb/umass.c b/sys/dev/usb/umass.c
index 0fc1782ac21..8ff61b29c76 100644
--- a/sys/dev/usb/umass.c
+++ b/sys/dev/usb/umass.c
@@ -131,6 +131,7 @@
 #include 
 #include 
 #include 
+#include 
 #undef KASSERT
 #define KASSERT(cond, msg)
 #include 
@@ -178,13 +179,15 @@ char *states[TSTATE_STATES+1] = {
 int umass_match(struct device *, void *, void *); 
 void umass_attach(struct device *, struct device *, void *); 
 int umass_detach(struct device *, int); 
+int umass_activate(struct device *, int);
 
 struct cfdriver umass_cd = { 
NULL, "umass", DV_DULL 
 }; 
 
 const struct cfattach umass_ca = {
-   sizeof(struct umass_softc), umass_match, umass_attach, umass_detach
+   sizeof(struct umass_softc), umass_match, umass_attach, umass_detach,
+   umass_activate
 };
 
 void umass_disco(struct umass_softc *sc);
@@ -651,6 +654,20 @@ umass_detach(struct device *self, int flags)
return (rv);
 }
 
+int umass_activate(struct device *self, int act)
+{
+   int rv = 0;
+
+   switch (act) {
+   case DVACT_SUSPEND:
+   set_powerdown_delay(2000);
+   break;
+   }
+   rv = config_activate_children(self, act);
+
+   return (rv);
+}
+
 void
 umass_disco(struct umass_softc *sc)
 {
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
index 975c6bc34d8..a3eeb48bc7f 100644
--- a/sys/kern/kern_xxx.c
+++ b/sys/kern/kern_xxx.c
@@ -40,6 +40,19 @@
 #include 
 #include 
 
+/*
+ * Minimum delay in ms before actual powerdown/reboot, in order
+ * to give time to devices for flushing caches.
+ */
+unsigned int powerdown_delay;
+
+void
+set_powerdown_delay(unsigned int ms)
+{
+   if (powerdown_delay < ms)
+   powerdown_delay = ms;
+}
+
 int
 sys_reboot(struct proc *p, void *v, register_t *retval)
 {
diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h
index f82d87a8e9d..a58e772ad6f 100644
--- a/sys/sys/reboot.h
+++ b/sys/sys/reboot.h
@@ -96,8 +96,11 @@
 #ifdefined(_KERNEL) && !defined(_STANDALONE) && !defined(_LOCORE)
 
 __BEGIN_DECLS
+extern unsigned int powerdown_delay;
+
 __dead voidreboot(int);
 __dead voidboot(int);
+void set_powerdown_delay(unsigned int);
 __END_DECLS
 
 #endif /* _KERNEL */



Re: reboot: Only wait for procs if there are any

2017-07-09 Thread Stefan Fritsch
On Sun, 9 Jul 2017, Janne Johansson wrote:

> One guess would be that the post-sync() sleep might give "async" devices a
> chance to flush out I/O, so shortening that one just because no pid is
> killable might change peoples experiences.

There are actually two sync calls, the first is before the first sleep. 
But more importantly, waiting for devices to flush out I/O should be done 
by the respective drivers in the kernel. That way the wait only affects 
those people who need it. I will send a separate mail about this.

> 
> 
> 2017-07-09 9:42 GMT+02:00 Stefan Fritsch <s...@sfritsch.de>:
> 
> > Hi,
> >
> > what about only waiting for processes to die on reboot if there are any
> > processes left? Nice for the frequently rebooting kernel developer.
> >
> > This usually saves at least one second and often 4-5 seconds when calling
> > reboot via ssh or 'exec reboot' from a console (otherwise the parent shell
> > won't exit before the first SIGKILL).
> >
> > ok?
> >
> > Cheers,
> > Stefan
> >
> > diff --git sbin/reboot/reboot.c sbin/reboot/reboot.c
> > index dd85d0d9c8c..5556514295e 100644
> > --- sbin/reboot/reboot.c
> > +++ sbin/reboot/reboot.c
> > @@ -57,6 +57,17 @@ int  dohalt;
> >
> >  #define _PATH_RC   "/etc/rc"
> >
> > +static void
> > +sleep_while_procs(int seconds)
> > +{
> > +   while (seconds > 0) {
> > +   if (kill(-1, 0) == -1 && errno == ESRCH)
> > +   return;
> > +   sleep(1);
> > +   seconds--;
> > +   }
> > +}
> > +
> >  int
> >  main(int argc, char *argv[])
> >  {
> > @@ -232,10 +243,10 @@ main(int argc, char *argv[])
> >  * buffers on their way.  Wait 5 seconds between the SIGTERM and
> >  * the SIGKILL to give everybody a chance.
> >  */
> > -   sleep(2);
> > +   sleep_while_procs(2);
> > if (!nflag)
> > sync();
> > -   sleep(3);
> > +   sleep_while_procs(3);
> >
> > for (i = 1;; ++i) {
> > if (kill(-1, SIGKILL) == -1) {
> > @@ -247,7 +258,7 @@ main(int argc, char *argv[])
> > warnx("WARNING: some process(es) wouldn't die");
> > break;
> > }
> > -   (void)sleep(2 * i);
> > +   sleep_while_procs(2 * i);
> > }
> >
> > reboot(howto);
> >
> >
> 
> 
> 



reboot: Only wait for procs if there are any

2017-07-09 Thread Stefan Fritsch
Hi,

what about only waiting for processes to die on reboot if there are any 
processes left? Nice for the frequently rebooting kernel developer.

This usually saves at least one second and often 4-5 seconds when calling 
reboot via ssh or 'exec reboot' from a console (otherwise the parent shell 
won't exit before the first SIGKILL).

ok?

Cheers,
Stefan

diff --git sbin/reboot/reboot.c sbin/reboot/reboot.c
index dd85d0d9c8c..5556514295e 100644
--- sbin/reboot/reboot.c
+++ sbin/reboot/reboot.c
@@ -57,6 +57,17 @@ int  dohalt;
 
 #define _PATH_RC   "/etc/rc"
 
+static void
+sleep_while_procs(int seconds)
+{
+   while (seconds > 0) {
+   if (kill(-1, 0) == -1 && errno == ESRCH)
+   return;
+   sleep(1);
+   seconds--;
+   }
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -232,10 +243,10 @@ main(int argc, char *argv[])
 * buffers on their way.  Wait 5 seconds between the SIGTERM and
 * the SIGKILL to give everybody a chance.
 */
-   sleep(2);
+   sleep_while_procs(2);
if (!nflag)
sync();
-   sleep(3);
+   sleep_while_procs(3);
 
for (i = 1;; ++i) {
if (kill(-1, SIGKILL) == -1) {
@@ -247,7 +258,7 @@ main(int argc, char *argv[])
warnx("WARNING: some process(es) wouldn't die");
break;
}
-   (void)sleep(2 * i);
+   sleep_while_procs(2 * i);
}
 
reboot(howto);



De-hole struct sel_info

2017-07-09 Thread Stefan Fritsch
This reduces the size of struct selinfo from 24 to 16 bytes on 64bit 
archs.

struct sel_info is embedded in quite a few other structs (tty, kevent, 
sockbuf, ...). So this may affect libkvm. What's the procedure for 
changing such a struct?

Cheers,
Stefan

diff --git sys/sys/selinfo.h sys/sys/selinfo.h
index 8235c8c0944..9c9f1c7a153 100644
--- sys/sys/selinfo.h
+++ sys/sys/selinfo.h
@@ -41,8 +41,8 @@
  * notified when I/O becomes possible.
  */
 struct selinfo {
-   pid_t   si_seltid;  /* thread to be notified */
struct  klist si_note;  /* kernel note list */
+   pid_t   si_seltid;  /* thread to be notified */
short   si_flags;   /* see below */
 };
 #defineSI_COLL 0x0001  /* collision occurred */



Re: Copying a file on msdos FS (fat32) changes content

2017-06-14 Thread Stefan Fritsch
On Tue, 13 Jun 2017, Martijn Rijkeboer wrote:
> Yes, this patch fixes the problem.

Thanks for the report and the testing. I have committed the revert 
yesterday.



Re: Copying a file on msdos FS (fat32) changes content

2017-06-12 Thread Stefan Fritsch
On Mon, 12 Jun 2017, Martijn Rijkeboer wrote:
> After upgrading to the latest snapshot there seems to be something wrong
> with the msdos filesystem driver. When I copy a binary file on a msdos (fat32)
> mounted partition the content changes e.g.:
> 
>   # cp refind_x64.efi bootx64.efi
>   # ls -l refind_x64.efi bootx64.efi
>   -rw-r--r--  1 root  wheel  230856 Jun 12 10:47 bootx64.efi
>   -rw-r--r--  1 root  wheel  230856 Mar  9 11:09 refind_x64.efi
>   # sha256 refind_x64.efi bootx64.efi
>   SHA256 (refind_x64.efi) =
>   9c9a38f168fed270c158ff5a68d3fa5172eb15bcb41662abf69faa13d2abc418
>   SHA256 (bootx64.efi) =
>   26f7350143cc897d53b0257dbf5d9f1d84eace4746cbd9f2ff95a033ee0c577f

Sigh.

Please try if the attached patch fixes the problem. It reverts a likely 
culprit.

Cheers,
Stefan

diff --git sys/msdosfs/denode.h sys/msdosfs/denode.h
index efa8192a06d..cdca90500ab 100644
--- sys/msdosfs/denode.h
+++ sys/msdosfs/denode.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: denode.h,v 1.31 2017/05/29 13:48:12 sf Exp $  */
+/* $OpenBSD: denode.h,v 1.30 2016/08/30 19:47:23 sf Exp $  */
 /* $NetBSD: denode.h,v 1.24 1997/10/17 11:23:39 ws Exp $   */
 
 /*-
@@ -142,6 +142,7 @@ struct denode {
struct vnode *de_devvp; /* vnode of blk dev we live on */
uint32_t de_flag;   /* flag bits */
dev_t de_dev;   /* device where direntry lives */
+   daddr_t de_lastr;
uint32_t de_dirclust;   /* cluster of the directory file containing 
this entry */
uint32_t de_diroffset;  /* offset of this entry in the directory 
cluster */
uint32_t de_fndoffset;  /* offset of found dir entry */
diff --git sys/msdosfs/msdosfs_vnops.c sys/msdosfs/msdosfs_vnops.c
index 61b45af6185..39c60044df5 100644
--- sys/msdosfs/msdosfs_vnops.c
+++ sys/msdosfs/msdosfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: msdosfs_vnops.c,v 1.114 2017/05/29 13:48:12 sf Exp $  */
+/* $OpenBSD: msdosfs_vnops.c,v 1.113 2016/08/30 19:47:23 sf Exp $  */
 /* $NetBSD: msdosfs_vnops.c,v 1.63 1997/10/17 11:24:19 ws Exp $*/
 
 /*-
@@ -77,13 +77,13 @@
 
 static uint32_t fileidhash(uint64_t);
 
-int msdosfs_bmaparray(struct vnode *, daddr_t, daddr_t *, int *);
 int msdosfs_kqfilter(void *);
 int filt_msdosfsread(struct knote *, long);
 int filt_msdosfswrite(struct knote *, long);
 int filt_msdosfsvnode(struct knote *, long);
 void filt_msdosfsdetach(struct knote *);
 
+
 /*
  * Some general notes:
  *
@@ -509,14 +509,18 @@ int
 msdosfs_read(void *v)
 {
struct vop_read_args *ap = v;
+   int error = 0;
+   uint32_t diff;
+   int blsize;
+   int isadir;
+   uint32_t n;
+   long on;
+   daddr_t lbn, rablock, rablkno;
+   struct buf *bp;
struct vnode *vp = ap->a_vp;
struct denode *dep = VTODE(vp);
struct msdosfsmount *pmp = dep->de_pmp;
struct uio *uio = ap->a_uio;
-   int isadir, error = 0;
-   uint32_t n, diff, size, on;
-   struct buf *bp;
-   daddr_t cn, bn;
 
/*
 * If they didn't ask for any data, then we are done.
@@ -531,8 +535,7 @@ msdosfs_read(void *v)
if (uio->uio_offset >= dep->de_FileSize)
return (0);
 
-   cn = de_cluster(pmp, uio->uio_offset);
-   size = pmp->pm_bpcluster;
+   lbn = de_cluster(pmp, uio->uio_offset);
on = uio->uio_offset & pmp->pm_crbomask;
n = ulmin(pmp->pm_bpcluster - on, uio->uio_resid);
 
@@ -545,22 +548,31 @@ msdosfs_read(void *v)
if (diff < n)
n = diff;
 
+   /* convert cluster # to block # if a directory */
+   if (isadir) {
+   error = pcbmap(dep, lbn, , 0, );
+   if (error)
+   return (error);
+   }
/*
 * If we are operating on a directory file then be sure to
 * do i/o with the vnode for the filesystem instead of the
 * vnode for the directory.
 */
if (isadir) {
-   error = pcbmap(dep, cn, , 0, );
-   if (error)
-   return (error);
-   error = bread(pmp->pm_devvp, cn, size, );
+   error = bread(pmp->pm_devvp, lbn, blsize, );
} else {
-   bn = de_cn2bn(pmp, cn);
-   if (de_cn2off(pmp, cn + 1) >= dep->de_FileSize)
-   error = bread(vp, bn, size, );
+   rablock = lbn + 1;
+   rablkno = de_cn2bn(pmp, rablock);
+   if (dep->de_lastr + 1 == lbn &&
+   de_cn2off(pmp, rablock) < dep->de_FileSize)
+   error = breadn(vp, de_cn2bn(pmp, lbn),
+   pmp->pm_bpcluster, ,
+   

ext2: flush cache on mount -ur

2017-05-30 Thread Stefan Fritsch
The same as is already implemented for ffs & msdosfs.

ok?

Cheers,
Stefan


diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c
index 53eaa05a32a..98d5536418c 100644
--- a/sys/ufs/ext2fs/ext2fs_vfsops.c
+++ b/sys/ufs/ext2fs/ext2fs_vfsops.c
@@ -181,6 +181,7 @@ ext2fs_mount(struct mount *mp, const char *path, void *data,
ump = VFSTOUFS(mp);
fs = ump->um_e2fs;
if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+   int force = 0;
flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
@@ -193,6 +194,12 @@ ext2fs_mount(struct mount *mp, const char *path, void 
*data,
}
if (error)
return (error);
+   /*
+* Updating mount to readonly. Try a cache flush.
+* Ignore error because the ioctl may not be supported.
+*/
+   VOP_IOCTL(ump->um_devvp, DIOCCACHESYNC, ,
+   FWRITE, FSCRED, p);
fs->e2fs_ronly = 1;
}
if (mp->mnt_flag & MNT_RELOAD) {



ext2fs: Mark superblock as not modified when written

2017-05-30 Thread Stefan Fritsch
I have seen spurious "file system not clean; please fsck(8)" warnings
during "mount -ur". Set e2fs_fmod = 0 when writing the superblock (as
ffs does).

ok?

Cheers,
Stefan


diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c
index 98d5536418c..372c4d6f1fc 100644
--- a/sys/ufs/ext2fs/ext2fs_vfsops.c
+++ b/sys/ufs/ext2fs/ext2fs_vfsops.c
@@ -1007,6 +1007,7 @@ ext2fs_sbupdate(struct ufsmount *mp, int waitfor)
error = bwrite(bp);
else
bawrite(bp);
+   fs->e2fs_fmod = 0;
return (error);
 }
 



Re: Fix for "Implement VFS read clustering for MSDOSFS"

2017-05-29 Thread Stefan Fritsch
On Mon, 29 May 2017, Martin Pieuchot wrote:

> A complete diff would be easier to review.

ok, here it comes.

The original commit message was this:

  Implement VFS read clustering for MSDOSFS.

  The logic used in msdosfs_bmap() to loop calling pcbmap() comes from
  FreeBSD and is not really efficient but it is good enough since it is
  only called when generating I/O.

  With this diff I get a 100% improvement when reading big files from a
  crappy USB stick.

  With this and bread_cluster(9) modified to not re-fetch B_CACHED buffers,
  reading large contiguous files with chunk sizes of MAXPHYS is almost as
  fast as physio(9) on the same device.

  For a 'real world' example, when copying music files from a USB stick I
  see a speed jump from 15MB/s on -current to 24Mb/s with this diff.

  While here rename some 'lbn' variables into 'cn' to better reflect what
  we're dealing with.

  Tested by Mathieu, with support from deraadt@

diff --git sys/msdosfs/denode.h sys/msdosfs/denode.h
index cdca90500ab..c3a413313c1 100644
--- sys/msdosfs/denode.h
+++ sys/msdosfs/denode.h
@@ -142,7 +142,6 @@ struct denode {
struct vnode *de_devvp; /* vnode of blk dev we live on */
uint32_t de_flag;   /* flag bits */
dev_t de_dev;   /* device where direntry lives */
-   daddr_t de_lastr;
uint32_t de_dirclust;   /* cluster of the directory file containing 
this entry */
uint32_t de_diroffset;  /* offset of this entry in the directory 
cluster */
uint32_t de_fndoffset;  /* offset of found dir entry */
diff --git sys/msdosfs/msdosfs_vnops.c sys/msdosfs/msdosfs_vnops.c
index 39c60044df5..783bd0d1d31 100644
--- sys/msdosfs/msdosfs_vnops.c
+++ sys/msdosfs/msdosfs_vnops.c
@@ -77,13 +77,13 @@
 
 static uint32_t fileidhash(uint64_t);
 
+int msdosfs_bmaparray(struct vnode *, daddr_t, daddr_t *, int *);
 int msdosfs_kqfilter(void *);
 int filt_msdosfsread(struct knote *, long);
 int filt_msdosfswrite(struct knote *, long);
 int filt_msdosfsvnode(struct knote *, long);
 void filt_msdosfsdetach(struct knote *);
 
-
 /*
  * Some general notes:
  *
@@ -509,18 +509,14 @@ int
 msdosfs_read(void *v)
 {
struct vop_read_args *ap = v;
-   int error = 0;
-   uint32_t diff;
-   int blsize;
-   int isadir;
-   uint32_t n;
-   long on;
-   daddr_t lbn, rablock, rablkno;
-   struct buf *bp;
struct vnode *vp = ap->a_vp;
struct denode *dep = VTODE(vp);
struct msdosfsmount *pmp = dep->de_pmp;
struct uio *uio = ap->a_uio;
+   int isadir, error = 0;
+   uint32_t n, diff, size, on;
+   struct buf *bp;
+   daddr_t cn, bn;
 
/*
 * If they didn't ask for any data, then we are done.
@@ -535,7 +531,8 @@ msdosfs_read(void *v)
if (uio->uio_offset >= dep->de_FileSize)
return (0);
 
-   lbn = de_cluster(pmp, uio->uio_offset);
+   cn = de_cluster(pmp, uio->uio_offset);
+   size = pmp->pm_bpcluster;
on = uio->uio_offset & pmp->pm_crbomask;
n = ulmin(pmp->pm_bpcluster - on, uio->uio_resid);
 
@@ -548,31 +545,22 @@ msdosfs_read(void *v)
if (diff < n)
n = diff;
 
-   /* convert cluster # to block # if a directory */
-   if (isadir) {
-   error = pcbmap(dep, lbn, , 0, );
-   if (error)
-   return (error);
-   }
/*
 * If we are operating on a directory file then be sure to
 * do i/o with the vnode for the filesystem instead of the
 * vnode for the directory.
 */
if (isadir) {
-   error = bread(pmp->pm_devvp, lbn, blsize, );
+   error = pcbmap(dep, cn, , 0, );
+   if (error)
+   return (error);
+   error = bread(pmp->pm_devvp, cn, size, );
} else {
-   rablock = lbn + 1;
-   rablkno = de_cn2bn(pmp, rablock);
-   if (dep->de_lastr + 1 == lbn &&
-   de_cn2off(pmp, rablock) < dep->de_FileSize)
-   error = breadn(vp, de_cn2bn(pmp, lbn),
-   pmp->pm_bpcluster, ,
-   >pm_bpcluster, 1, );
+   bn = de_cn2bn(pmp, cn);
+   if (de_cn2off(pmp, cn + 1) >= dep->de_FileSize)
+   error = bread(vp, bn, size, );
else
-   error = bread(vp, de_cn2bn(pmp, lbn),
-   pmp->pm_bpcluster, );
-   dep->de_lastr = lbn;
+

Fix for "Implement VFS read clustering for MSDOSFS"

2017-05-29 Thread Stefan Fritsch
Last year, mpi@ implemented VFS read clustering for MSDOSFS in

sys/msdosfs/denode.h 1.28
sys/msdosfs/msdosfs_vnops.c 1.105

This caused regressions when doing seeks past the end of the file and had 
to be reverted.

I have now written a test that catches the bug (committed in 
regress/sys/fileops) and got a fix that does not break the test:

There was some confusion as to what block sizes are used in file
operations. Openbsd seems to be more like netbsd than like freebsd.

See msdosfs_vnops.c 1.62 in freebsd:

Author: msmith 
Date:   Sun Mar 1 21:26:09 1998 +

Fix mmap() on msdosfs.  In the words of the submitter:

|In the process of evaluating the getpages/putpages issues I discovered
|that mmap on MSDOSFS does not work. This is because I blindly merged
|NetBSD changes in msdosfs_bmap and msdosfs_strategy. Apparently, their
|blocksize is always DEV_BSIZE (even in files), while in FreeBSD
|blocksize in files is v_mount->mnt_stat.f_iosize (i.e. clustersize in
|MSDOSFS case). The patch is below.

Submitted by:   Dmitrij Tejblum 

The diff is meant to be on top of mpi's diff (or on top of a revert of the 
revert 1.113). I can send the complete diff if that's preferred.

diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c
index 45a479fb646..46ff2736ca4 100644
--- sys/msdosfs/msdosfs_vnops.c
+++ sys/msdosfs/msdosfs_vnops.c
@@ -516,7 +516,7 @@ msdosfs_read(void *v)
int isadir, error = 0;
uint32_t n, diff, size, on;
struct buf *bp;
-   daddr_t cn;
+   daddr_t cn, bn;
 
/*
 * If they didn't ask for any data, then we are done.
@@ -556,10 +556,11 @@ msdosfs_read(void *v)
return (error);
error = bread(pmp->pm_devvp, cn, size, );
} else {
+   bn = de_cn2bn(pmp, cn);
if (de_cn2off(pmp, cn + 1) >= dep->de_FileSize)
-   error = bread(vp, cn, size, );
+   error = bread(vp, bn, size, );
else
-   error = bread_cluster(vp, cn, size, );
+   error = bread_cluster(vp, bn, size, );
}
n = min(n, pmp->pm_bpcluster - bp->b_resid);
if (error) {
@@ -588,7 +589,7 @@ msdosfs_write(void *v)
uint32_t osize;
int error = 0;
uint32_t count, lastcn;
-   daddr_t cn;
+   daddr_t cn, bn;
struct buf *bp;
int ioflag = ap->a_ioflag;
struct uio *uio = ap->a_uio;
@@ -681,18 +682,19 @@ msdosfs_write(void *v)
 * or we write the cluster from its start beyond EOF,
 * then no need to read data from disk.
 */
-   bp = getblk(thisvp, cn, pmp->pm_bpcluster, 0, 0);
+   bn = de_cn2bn(pmp, cn);
+   bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0);
clrbuf(bp);
/*
 * Do the bmap now, since pcbmap needs buffers
 * for the fat table. (see msdosfs_strategy)
 */
if (bp->b_blkno == bp->b_lblkno) {
-   error = pcbmap(dep, bp->b_lblkno, , 0, 0);
+   error = pcbmap(dep,
+  de_bn2cn(pmp, bp->b_lblkno),
+  >b_blkno, 0, 0);
if (error)
bp->b_blkno = -1;
-   else
-   bp->b_blkno = cn;
}
if (bp->b_blkno == -1) {
brelse(bp);
@@ -705,7 +707,8 @@ msdosfs_write(void *v)
 * The block we need to write into exists, so
 * read it in.
 */
-   error = bread(thisvp, cn, pmp->pm_bpcluster, );
+   bn = de_cn2bn(pmp, cn);
+   error = bread(thisvp, bn, pmp->pm_bpcluster, );
if (error) {
brelse(bp);
break;
@@ -1820,7 +1823,8 @@ msdosfs_strategy(void *v)
 * don't allow files with holes, so we shouldn't ever see this.
 */
if (bp->b_blkno == bp->b_lblkno) {
-   error = pcbmap(dep, bp->b_lblkno, >b_blkno, 0, 0);
+   error = pcbmap(dep, de_bn2cn(dep->de_pmp, bp->b_lblkno),
+  >b_blkno, 0, 0);
if (error)
bp->b_blkno = -1;
if (bp->b_blkno == -1)



msdosfs & ffs: flush cache if updating mount to r/o

2017-05-28 Thread Stefan Fritsch
Other file systems can be changed later.

diff --git sys/msdosfs/msdosfs_vfsops.c sys/msdosfs/msdosfs_vfsops.c
index e13b0b1ea0b..5cd273c4087 100644
--- sys/msdosfs/msdosfs_vfsops.c
+++ sys/msdosfs/msdosfs_vfsops.c
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -128,8 +129,14 @@ msdosfs_mount(struct mount *mp, const char *path, void 
*data,
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = vflush(mp, NULLVP, flags);
-   if (!error)
+   if (!error) {
+   int force = 0;
+
pmp->pm_flags |= MSDOSFSMNT_RONLY;
+   /* may be not supported, ignore error */
+   VOP_IOCTL(pmp->pm_devvp, DIOCCACHESYNC,
+   , FWRITE, FSCRED, p);
+   }
}
if (!error && (mp->mnt_flag & MNT_RELOAD))
/* not yet implemented */
diff --git sys/ufs/ffs/ffs_vfsops.c sys/ufs/ffs/ffs_vfsops.c
index 0770e9aa45d..3714c0c09cc 100644
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -456,6 +456,16 @@ success:
fs->fs_flags &= ~FS_DOSOFTDEP;
}
ffs_sbupdate(ump, MNT_WAIT);
+   if (ronly) {
+   int force = 0;
+
+   /*
+* Updating mount to readonly. Try a cache flush.
+* Ignore error because the ioctl may not be supported.
+*/
+   VOP_IOCTL(ump->um_devvp, DIOCCACHESYNC, ,
+   FWRITE, FSCRED, p);
+   }
}
return (0);
 



Add DIOCCACHESYNC ioctl

2017-05-28 Thread Stefan Fritsch
Add an ioctl to tell storage devices to flush their internal caches.
Initially ported from netbsd by pedro@

I will implement it for nvme later (if we go ahead with this).


diff --git sys/dev/ata/wd.c sys/dev/ata/wd.c
index cd168223672..859fb454f44 100644
--- sys/dev/ata/wd.c
+++ sys/dev/ata/wd.c
@@ -26,7 +26,7 @@
  */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -138,7 +138,7 @@ void  wdstart(void *);
 void  __wdstart(struct wd_softc*, struct buf *);
 void  wdrestart(void *);
 int   wd_get_params(struct wd_softc *, u_int8_t, struct ataparams *);
-void  wd_flushcache(struct wd_softc *, int);
+int   wd_flushcache(struct wd_softc *, int);
 void  wd_standby(struct wd_softc *, int);
 
 /* XXX: these should go elsewhere */
@@ -848,6 +848,14 @@ wdioctl(dev_t dev, u_long xfer, caddr_t addr, int flag, 
struct proc *p)
}
 #endif
 
+   case DIOCCACHESYNC:
+   if ((flag & FWRITE) == 0) {
+   error = EBADF;
+   goto exit;
+   }
+   error = wd_flushcache(wd, AT_WAIT);
+   goto exit;
+
default:
error = wdc_ioctl(wd->drvp, xfer, addr, flag, p);
goto exit;
@@ -1067,13 +1075,13 @@ wd_get_params(struct wd_softc *wd, u_int8_t flags, 
struct ataparams *params)
}
 }
 
-void
+int
 wd_flushcache(struct wd_softc *wd, int flags)
 {
struct wdc_command wdc_c;
 
if (wd->drvp->ata_vers < 4) /* WDCC_FLUSHCACHE is here since ATA-4 */
-   return;
+   return EIO;
bzero(_c, sizeof(struct wdc_command));
wdc_c.r_command = (wd->sc_flags & WDF_LBA48 ? WDCC_FLUSHCACHE_EXT :
WDCC_FLUSHCACHE);
@@ -1088,20 +1096,28 @@ wd_flushcache(struct wd_softc *wd, int flags)
if (wdc_exec_command(wd->drvp, _c) != WDC_COMPLETE) {
printf("%s: flush cache command didn't complete\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
+   if (wdc_c.flags & ERR_NODEV)
+   return ENODEV;
if (wdc_c.flags & AT_TIMEOU) {
printf("%s: flush cache command timeout\n",
wd->sc_dev.dv_xname);
+   return EIO;
+   }
+   if (wdc_c.flags & AT_ERROR) {
+   if (wdc_c.r_error == WDCE_ABRT) /* command not supported */
+   return ENODEV;
+   printf("%s: flush cache command: error 0x%x\n",
+   wd->sc_dev.dv_xname, wdc_c.r_error);
+   return EIO;
}
if (wdc_c.flags & AT_DF) {
printf("%s: flush cache command: drive fault\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
-   /*
-* Ignore error register, it shouldn't report anything else
-* than COMMAND ABORTED, which means the device doesn't support
-* flush cache
-*/
+   return 0;
 }
 
 void
diff --git sys/scsi/sd.c sys/scsi/sd.c
index b964f803da3..4cd0e4d13bd 100644
--- sys/scsi/sd.c
+++ sys/scsi/sd.c
@@ -2,7 +2,7 @@
 /* $NetBSD: sd.c,v 1.111 1997/04/02 02:29:41 mycroft Exp $ */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -96,7 +96,7 @@ int   sd_vpd_block_limits(struct sd_softc *, int);
 intsd_vpd_thin(struct sd_softc *, int);
 intsd_thin_params(struct sd_softc *, int);
 intsd_get_parms(struct sd_softc *, struct disk_parms *, int);
-void   sd_flush(struct sd_softc *, int);
+intsd_flush(struct sd_softc *, int);
 
 void   viscpy(u_char *, u_char *, int);
 
@@ -999,6 +999,15 @@ sdioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, 
struct proc *p)
error = sd_ioctl_cache(sc, cmd, (struct dk_cache *)addr);
goto exit;
 
+   case DIOCCACHESYNC:
+   if (!ISSET(flag, FWRITE)) {
+   error = EBADF;
+   goto exit;
+   }
+   if ((sc->flags & SDF_DIRTY) != 0 || *(int *)addr != 0)
+   error = sd_flush(sc, 0);
+   goto exit;
+
default:
if (part != RAW_PART) {
error = ENOTTY;
@@ -1876,19 +1885,20 @@ die:
return (SDGP_RESULT_OFFLINE);
 }
 
-void
+int
 sd_flush(struct sd_softc *sc, int flags)
 {
struct scsi_link *link;
struct scsi_xfer *xs;
struct scsi_synchronize_cache *cmd;
+   int error;
 
if (sc->flags & SDF_DYING)
-   return;
+   return (ENXIO);
link = sc->sc_link;
 
if (link->quirks & SDEV_NOSYNCCACHE)
-   return;
+   return (0);
 
/*
  

Flush disk cache on mount -ur

2017-05-28 Thread Stefan Fritsch
Hi,

I want to come back to the discussion from Oct. 2016 on tech@ about a
disk cache flush ioctl.


The problem we want to solve is that in case of power loss, there should 
be no data loss on partitions that are either mounted read-only or are 
unmounted. This should also be true if such a partition was previously 
mounted r/w (this is what makes this difficult).

The most common situation where this is an issue are devices like home 
routers, which have their root fs on a ramdisk and only store 
configuration data on the disk. When the configuration is changed, the r/o 
mount is changed to r/w, the data is written, and then the mount is 
changed to r/o again.

So, the proposal was to

* add a DIOCCACHESYNC ioctl that can be used to flush a disk's cache

* add code to the file systems that executes this ioctl when a mount
  is updated from r/w to r/o

* change the various disk devices to do a cache flush whenever a
  writable physio file descriptor is closed on a partition. Right now
  this is only done if the last such file descriptor for a complete
  disk is closed.


There was some argument that the cache flush should not be done by the
filesystems but in a central place. The problem with that is that
currently the file systems do not notify anyone if a mount is changed
between r/o and r/w. So it's quite possible that a file system does
VOP_OPEN() and VOP_CLOSE() with different setting of F_WRITE. Or it
could be that despite the VOP_OPEN() call only having F_READ, the file
system is later changed to r/w.  So, if we wanted to go this way, we
would need a new call (VOP_UPDATE?) to change F_WRITE in the flags and
make all file systems use it..

Another issue voiced was the performance impact. I don't think that 
umounting or remounting file systems happens often enough for this to be a 
problem. For scsi it would actually be possible to do a cache flush only 
for a single partition, but for ata/nvme there is only an API for a cache 
flush for the whole disk.

I will re-send the patches that I have in separate mails.

Are there any other ideas how to go forward with this?

Cheers,
Stefan



Re: Suspend/Resume for nvme

2017-05-27 Thread Stefan Fritsch
On Fri, 26 May 2017, Claudio Jeker wrote:

> Testing it on my X270. I get:
> nvme0: unable to delete q, disabling
> 
> Apart from that it seems to work (eventhough without inteldrm not very
> helpful since I lose the display).

Thanks for testing.

We get called twice on suspend, once with DVACT_SUSPEND and once with 
DVACT_POWERDOWN. So, here is a patch that does it like in ahci.c an does 
everything in the DVACT_POWERDOWN path and nothing in the DVACT_SUSPEND 
path.

ok ?


--- sys/dev/ic/nvme.c
+++ sys/dev/ic/nvme.c
@@ -45,6 +45,7 @@ int   nvme_ready(struct nvme_softc *, u_int32_t);
 intnvme_enable(struct nvme_softc *, u_int);
 intnvme_disable(struct nvme_softc *);
 intnvme_shutdown(struct nvme_softc *);
+intnvme_resume(struct nvme_softc *);
 
 void   nvme_dumpregs(struct nvme_softc *);
 intnvme_identify(struct nvme_softc *, u_int);
@@ -68,6 +69,7 @@ void  nvme_empty_done(struct nvme_softc *, struct nvme_ccb *,
 struct nvme_queue *
nvme_q_alloc(struct nvme_softc *, u_int16_t, u_int, u_int);
 intnvme_q_create(struct nvme_softc *, struct nvme_queue *);
+intnvme_q_reset(struct nvme_softc *, struct nvme_queue *);
 intnvme_q_delete(struct nvme_softc *, struct nvme_queue *);
 void   nvme_q_submit(struct nvme_softc *,
struct nvme_queue *, struct nvme_ccb *,
@@ -264,7 +266,6 @@ nvme_attach(struct nvme_softc *sc)
struct scsibus_attach_args saa;
u_int64_t cap;
u_int32_t reg;
-   u_int dstrd;
u_int mps = PAGE_SHIFT;
 
mtx_init(>sc_ccb_mtx, IPL_BIO);
@@ -280,7 +281,7 @@ nvme_attach(struct nvme_softc *sc)
printf(", NVMe %d.%d\n", NVME_VS_MJR(reg), NVME_VS_MNR(reg));
 
cap = nvme_read8(sc, NVME_CAP);
-   dstrd = NVME_CAP_DSTRD(cap);
+   sc->sc_dstrd = NVME_CAP_DSTRD(cap);
if (NVME_CAP_MPSMIN(cap) > PAGE_SHIFT) {
printf("%s: NVMe minimum page size %u "
"is greater than CPU page size %u\n", DEVNAME(sc),
@@ -292,6 +293,7 @@ nvme_attach(struct nvme_softc *sc)
 
sc->sc_rdy_to = NVME_CAP_TO(cap);
sc->sc_mps = 1 << mps;
+   sc->sc_mps_bits = mps;
sc->sc_mdts = MAXPHYS;
sc->sc_max_sgl = 2;
 
@@ -300,7 +302,7 @@ nvme_attach(struct nvme_softc *sc)
return (1);
}
 
-   sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, 128, dstrd);
+   sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, 128, sc->sc_dstrd);
if (sc->sc_admin_q == NULL) {
printf("%s: unable to allocate admin queue\n", DEVNAME(sc));
return (1);
@@ -330,7 +332,7 @@ nvme_attach(struct nvme_softc *sc)
goto free_admin_q;
}
 
-   sc->sc_q = nvme_q_alloc(sc, 1, 128, dstrd);
+   sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
if (sc->sc_q == NULL) {
printf("%s: unable to allocate io q\n", DEVNAME(sc));
goto disable;
@@ -375,6 +377,47 @@ free_admin_q:
 }
 
 int
+nvme_resume(struct nvme_softc *sc)
+{
+   if (nvme_disable(sc) != 0) {
+   printf("%s: unable to disable controller\n", DEVNAME(sc));
+   return (1);
+   }
+
+   if (nvme_q_reset(sc, sc->sc_admin_q) != 0) {
+   printf("%s: unable to reset admin queue\n", DEVNAME(sc));
+   return (1);
+   }
+
+   if (nvme_enable(sc, sc->sc_mps_bits) != 0) {
+   printf("%s: unable to enable controller\n", DEVNAME(sc));
+   return (1);
+   }
+
+   sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+   if (sc->sc_q == NULL) {
+   printf("%s: unable to allocate io q\n", DEVNAME(sc));
+   goto disable;
+   }
+
+   if (nvme_q_create(sc, sc->sc_q) != 0) {
+   printf("%s: unable to create io q\n", DEVNAME(sc));
+   goto free_q;
+   }
+
+   nvme_write4(sc, NVME_INTMC, 1);
+
+   return (0);
+
+free_q:
+   nvme_q_free(sc, sc->sc_q);
+disable:
+   nvme_disable(sc);
+
+   return (1);
+}
+
+int
 nvme_scsi_probe(struct scsi_link *link)
 {
struct nvme_softc *sc = link->adapter_softc;
@@ -469,6 +512,11 @@ nvme_activate(struct nvme_softc *sc, int act)
rv = config_activate_children(>sc_dev, act);
nvme_shutdown(sc);
break;
+   case DVACT_RESUME:
+   rv = nvme_resume(sc);
+   if (rv == 0)
+   rv = config_activate_children(>sc_dev, act);
+   break;
default:
rv = config_activate_children(>sc_dev, act);
break;
@@ -1079,6 +1127,8 @@ nvme_q_delete(struct nvme_softc *sc, struct nvme_queue *q)
if (rv != 0)
goto fail;
 
+   nvme_q_free(sc, q);
+
 fail:
scsi_io_put(>sc_iopool, ccb);
return (rv);
@@ -1208,6 +1258,7 @@ nvme_q_alloc(struct nvme_softc *sc, u_int16_t id, u_int 
entries, u_int dstrd)
mtx_init(>q_cq_mtx, IPL_BIO);
q->q_sqtdbl 

dhclient: request interface-mtu by default?

2017-05-26 Thread Stefan Fritsch
Hi,

does anyone know a specific reason why we should not request interface-mtu 
by default? It may be set by the DHCP server enable jumbo frames or to 
work around PMTU discovery breakage.

Cheers,
Stefan

--- a/sbin/dhclient/clparse.c
+++ b/sbin/dhclient/clparse.c
@@ -118,6 +118,8 @@ read_client_conf(struct interface_info *ifi)
[config->requested_option_count++] = DHO_BOOTFILE_NAME;
config->requested_options
[config->requested_option_count++] = DHO_TFTP_SERVER;
+   config->requested_options
+   [config->requested_option_count++] = DHO_INTERFACE_MTU;
 
if ((cfile = fopen(path_dhclient_conf, "r")) != NULL) {
do {



Suspend/Resume for nvme

2017-05-26 Thread Stefan Fritsch
Hi,

the following diff adds the missing bits to make suspend/resuem work with 
nvme. It was done by ehrhardt@ and tested against nvme.c 1.50, but seems 
to apply cleanly.

Unfortunately, I don't have any nvme hardware at the moment in order to 
test it. Is there anyone who could test it?

Cheers,
Stefan

diff --git sys/dev/ic/nvme.c sys/dev/ic/nvme.c
index 3c4b48fd822..e8d4af5059b 100644
--- sys/dev/ic/nvme.c
+++ sys/dev/ic/nvme.c
@@ -45,6 +45,7 @@ int   nvme_ready(struct nvme_softc *, u_int32_t);
 intnvme_enable(struct nvme_softc *, u_int);
 intnvme_disable(struct nvme_softc *);
 intnvme_shutdown(struct nvme_softc *);
+intnvme_resume(struct nvme_softc *);
 
 void   nvme_dumpregs(struct nvme_softc *);
 intnvme_identify(struct nvme_softc *, u_int);
@@ -68,6 +69,7 @@ void  nvme_empty_done(struct nvme_softc *, struct nvme_ccb *,
 struct nvme_queue *
nvme_q_alloc(struct nvme_softc *, u_int16_t, u_int, u_int);
 intnvme_q_create(struct nvme_softc *, struct nvme_queue *);
+intnvme_q_reset(struct nvme_softc *, struct nvme_queue *);
 intnvme_q_delete(struct nvme_softc *, struct nvme_queue *);
 void   nvme_q_submit(struct nvme_softc *,
struct nvme_queue *, struct nvme_ccb *,
@@ -264,7 +266,6 @@ nvme_attach(struct nvme_softc *sc)
struct scsibus_attach_args saa;
u_int64_t cap;
u_int32_t reg;
-   u_int dstrd;
u_int mps = PAGE_SHIFT;
 
mtx_init(>sc_ccb_mtx, IPL_BIO);
@@ -280,7 +281,7 @@ nvme_attach(struct nvme_softc *sc)
printf(", NVMe %d.%d\n", NVME_VS_MJR(reg), NVME_VS_MNR(reg));
 
cap = nvme_read8(sc, NVME_CAP);
-   dstrd = NVME_CAP_DSTRD(cap);
+   sc->sc_dstrd = NVME_CAP_DSTRD(cap);
if (NVME_CAP_MPSMIN(cap) > PAGE_SHIFT) {
printf("%s: NVMe minimum page size %u "
"is greater than CPU page size %u\n", DEVNAME(sc),
@@ -292,6 +293,7 @@ nvme_attach(struct nvme_softc *sc)
 
sc->sc_rdy_to = NVME_CAP_TO(cap);
sc->sc_mps = 1 << mps;
+   sc->sc_mps_bits = mps;
sc->sc_mdts = MAXPHYS;
sc->sc_max_sgl = 2;
 
@@ -300,7 +302,7 @@ nvme_attach(struct nvme_softc *sc)
return (1);
}
 
-   sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, 128, dstrd);
+   sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, 128, sc->sc_dstrd);
if (sc->sc_admin_q == NULL) {
printf("%s: unable to allocate admin queue\n", DEVNAME(sc));
return (1);
@@ -330,7 +332,7 @@ nvme_attach(struct nvme_softc *sc)
goto free_admin_q;
}
 
-   sc->sc_q = nvme_q_alloc(sc, 1, 128, dstrd);
+   sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
if (sc->sc_q == NULL) {
printf("%s: unable to allocate io q\n", DEVNAME(sc));
goto disable;
@@ -375,6 +377,47 @@ free_admin_q:
 }
 
 int
+nvme_resume(struct nvme_softc *sc)
+{
+   if (nvme_disable(sc) != 0) {
+   printf("%s: unable to disable controller\n", DEVNAME(sc));
+   return (1);
+   }
+
+   if (nvme_q_reset(sc, sc->sc_admin_q) != 0) {
+   printf("%s: unable to reset admin queue\n", DEVNAME(sc));
+   return (1);
+   }
+
+   if (nvme_enable(sc, sc->sc_mps_bits) != 0) {
+   printf("%s: unable to enable controller\n", DEVNAME(sc));
+   return (1);
+   }
+
+   sc->sc_q = nvme_q_alloc(sc, 1, 128, sc->sc_dstrd);
+   if (sc->sc_q == NULL) {
+   printf("%s: unable to allocate io q\n", DEVNAME(sc));
+   goto disable;
+   }
+
+   if (nvme_q_create(sc, sc->sc_q) != 0) {
+   printf("%s: unable to create io q\n", DEVNAME(sc));
+   goto free_q;
+   }
+
+   nvme_write4(sc, NVME_INTMC, 1);
+
+   return (0);
+
+free_q:
+   nvme_q_free(sc, sc->sc_q);
+disable:
+   nvme_disable(sc);
+
+   return (1);
+}
+
+int
 nvme_scsi_probe(struct scsi_link *link)
 {
struct nvme_softc *sc = link->adapter_softc;
@@ -469,6 +512,15 @@ nvme_activate(struct nvme_softc *sc, int act)
rv = config_activate_children(>sc_dev, act);
nvme_shutdown(sc);
break;
+   case DVACT_SUSPEND:
+   rv = config_activate_children(>sc_dev, act);
+   nvme_shutdown(sc);
+   break;
+   case DVACT_RESUME:
+   rv = nvme_resume(sc);
+   if (rv == 0)
+   rv = config_activate_children(>sc_dev, act);
+   break;
default:
rv = config_activate_children(>sc_dev, act);
break;
@@ -1079,6 +1131,8 @@ nvme_q_delete(struct nvme_softc *sc, struct nvme_queue *q)
if (rv != 0)
goto fail;
 
+   nvme_q_free(sc, q);
+
 fail:
scsi_io_put(>sc_iopool, ccb);
return (rv);
@@ -1208,6 +1262,7 @@ nvme_q_alloc(struct nvme_softc *sc, u_int16_t id, u_int 
entries, u_int dstrd)
 

Re: Relocate some of the vio(4) (re)init code into vio_init

2016-11-23 Thread Stefan Fritsch
On Wed, 23 Nov 2016, Mike Belopuhov wrote:

> This moves code that is supposed to be in init, but is in stop
> right now.  Tested on qemu.  OK?

The current code has some symmetry insofar that stop leaves the device in 
the same state as it is after attach. With your diff, the first vio_init() 
would find a different situation than the following calls. Not sure if one 
of the two variants is better than the other.

> 
> diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
> index d8c9798..68e6636 100644
> --- sys/dev/pci/if_vio.c
> +++ sys/dev/pci/if_vio.c
> @@ -662,14 +662,22 @@ vio_media_status(struct ifnet *ifp, struct ifmediareq 
> *imr)
>   */
>  int
>  vio_init(struct ifnet *ifp)
>  {
>   struct vio_softc *sc = ifp->if_softc;
> + struct virtio_softc *vsc = sc->sc_virtio;
>  
>   vio_stop(ifp, 0);
>   if_rxr_init(>sc_rx_ring, 2 * ((ifp->if_hardmtu / MCLBYTES) + 1),
>   sc->sc_vq[VQRX].vq_num);
> + virtio_reinit_start(vsc);
> + virtio_negotiate_features(vsc, vsc->sc_features, NULL);
> + virtio_start_vq_intr(vsc, >sc_vq[VQRX]);
> + virtio_stop_vq_intr(vsc, >sc_vq[VQTX]);
> + if (vsc->sc_nvqs >= 3)
> + virtio_start_vq_intr(vsc, >sc_vq[VQCTL]);
> + virtio_reinit_end(vsc);
>   vio_populate_rx_mbufs(sc);
>   ifp->if_flags |= IFF_RUNNING;
>   ifq_clr_oactive(>if_snd);
>   vio_iff(sc);
>   vio_link_state(ifp);
> @@ -693,17 +701,10 @@ vio_stop(struct ifnet *ifp, int disable)
>   vio_ctrleof(>sc_vq[VQCTL]);
>   vio_tx_drain(sc);
>   if (disable)
>   vio_rx_drain(sc);
>  
> - virtio_reinit_start(vsc);
> - virtio_negotiate_features(vsc, vsc->sc_features, NULL);
> - virtio_start_vq_intr(vsc, >sc_vq[VQRX]);
> - virtio_stop_vq_intr(vsc, >sc_vq[VQTX]);
> - if (vsc->sc_nvqs >= 3)
> - virtio_start_vq_intr(vsc, >sc_vq[VQCTL]);
> - virtio_reinit_end(vsc);
>   if (vsc->sc_nvqs >= 3) {
>   if (sc->sc_ctrl_inuse != FREE)
>   sc->sc_ctrl_inuse = RESET;
>   wakeup(>sc_ctrl_inuse);
>   }
> 
> 



Re: Fixup tsleeps in vio(4) and make them interruptable

2016-11-23 Thread Stefan Fritsch
On Wed, 23 Nov 2016, Mike Belopuhov wrote:
> This fixes up tsleep priorities in vio(4) and makes sleeps
> interruptable so that ifconfig can be killed with ^C.  Tested
> on qemu with a previous diff that makes vio(4) hang there.
> 
> OK?

I think the tsleep stuff is ok, but the vio_iff() changes are not.

> 
> diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
> index 97af2a2..d8c9798 100644
> --- sys/dev/pci/if_vio.c
> +++ sys/dev/pci/if_vio.c
> @@ -281,11 +281,11 @@ int vio_ctrl_rx(struct vio_softc *, int, int);
>  int  vio_set_rx_filter(struct vio_softc *);
>  void vio_iff(struct vio_softc *);
>  int  vio_media_change(struct ifnet *);
>  void vio_media_status(struct ifnet *, struct ifmediareq *);
>  int  vio_ctrleof(struct virtqueue *);
> -void vio_wait_ctrl(struct vio_softc *sc);
> +int  vio_wait_ctrl(struct vio_softc *sc);
>  int  vio_wait_ctrl_done(struct vio_softc *sc);
>  void vio_ctrl_wakeup(struct vio_softc *, enum vio_ctrl_state);
>  int  vio_alloc_mem(struct vio_softc *);
>  int  vio_alloc_dmamem(struct vio_softc *);
>  void vio_free_dmamem(struct vio_softc *);
> @@ -1219,11 +1219,12 @@ vio_ctrl_rx(struct vio_softc *sc, int cmd, int onoff)
>  
>   if (vsc->sc_nvqs < 3)
>   return ENOTSUP;
>  
>   splassert(IPL_NET);
> - vio_wait_ctrl(sc);
> + if ((r = vio_wait_ctrl(sc)) != 0)
> + return r;
>  
>   sc->sc_ctrl_cmd->class = VIRTIO_NET_CTRL_RX;
>   sc->sc_ctrl_cmd->command = cmd;
>   sc->sc_ctrl_rx->onoff = onoff;
>  
> @@ -1246,14 +1247,12 @@ vio_ctrl_rx(struct vio_softc *sc, int cmd, int onoff)
>   sizeof(*sc->sc_ctrl_rx), 1);
>   VIO_DMAMEM_ENQUEUE(sc, vq, slot, sc->sc_ctrl_status,
>   sizeof(*sc->sc_ctrl_status), 0);
>   virtio_enqueue_commit(vsc, vq, slot, 1);
>  
> - if (vio_wait_ctrl_done(sc)) {
> - r = EIO;
> + if ((r = vio_wait_ctrl_done(sc)) != 0)
>   goto out;
> - }
>  
>   VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_cmd,
>   sizeof(*sc->sc_ctrl_cmd), BUS_DMASYNC_POSTWRITE);
>   VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_rx,
>   sizeof(*sc->sc_ctrl_rx), BUS_DMASYNC_POSTWRITE);
> @@ -1271,28 +1270,38 @@ vio_ctrl_rx(struct vio_softc *sc, int cmd, int onoff)
>  out:
>   vio_ctrl_wakeup(sc, FREE);
>   return r;
>  }
>  
> -void
> +int
>  vio_wait_ctrl(struct vio_softc *sc)
>  {
> - while (sc->sc_ctrl_inuse != FREE)
> - tsleep(>sc_ctrl_inuse, IPL_NET, "vio_wait", 0);
> + int r = 0;
> +
> + while (sc->sc_ctrl_inuse != FREE) {
> + r = tsleep(>sc_ctrl_inuse, PRIBIO|PCATCH, "viowait", 0);
> + if (r == EINTR)
> + return r;
> + }
>   sc->sc_ctrl_inuse = INUSE;
> +
> + return r;
>  }
>  
>  int
>  vio_wait_ctrl_done(struct vio_softc *sc)
>  {
>   int r = 0;
> +
>   while (sc->sc_ctrl_inuse != DONE && sc->sc_ctrl_inuse != RESET) {
>   if (sc->sc_ctrl_inuse == RESET) {
> - r = 1;
> + r = EIO;
>   break;
>   }
> - tsleep(>sc_ctrl_inuse, IPL_NET, "vio_wait", 0);
> + r = tsleep(>sc_ctrl_inuse, PRIBIO|PCATCH, "viodone", 0);
> + if (r == EINTR)
> + break;
>   }
>   return r;
>  }
>  
>  void
> @@ -1334,11 +1343,12 @@ vio_set_rx_filter(struct vio_softc *sc)
>   splassert(IPL_NET);
>  
>   if (vsc->sc_nvqs < 3)
>   return ENOTSUP;
>  
> - vio_wait_ctrl(sc);
> + if ((r = vio_wait_ctrl(sc)) != 0)
> + return r;
>  
>   sc->sc_ctrl_cmd->class = VIRTIO_NET_CTRL_MAC;
>   sc->sc_ctrl_cmd->command = VIRTIO_NET_CTRL_MAC_TABLE_SET;
>  
>   VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_cmd,
> @@ -1364,14 +1374,12 @@ vio_set_rx_filter(struct vio_softc *sc)
>   sc->sc_ctrl_mac_tbl_mc->nentries * ETHER_ADDR_LEN, 1);
>   VIO_DMAMEM_ENQUEUE(sc, vq, slot, sc->sc_ctrl_status,
>   sizeof(*sc->sc_ctrl_status), 0);
>   virtio_enqueue_commit(vsc, vq, slot, 1);
>  
> - if (vio_wait_ctrl_done(sc)) {
> - r = EIO;
> + if ((r = vio_wait_ctrl_done(sc)) != 0)
>   goto out;
> - }
>  
>   VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_cmd,
>   sizeof(*sc->sc_ctrl_cmd), BUS_DMASYNC_POSTWRITE);
>   VIO_DMAMEM_SYNC(vsc, sc, sc->sc_ctrl_mac_info,
>   VIO_CTRL_MAC_INFO_SIZE, BUS_DMASYNC_POSTWRITE);
> @@ -1436,29 +1444,22 @@ vio_iff(struct vio_softc *sc)
>  
>   /* set unicast address, VirtualBox wants that */
>   memcpy(sc->sc_ctrl_mac_tbl_uc->macs[0], ac->ac_enaddr, ETHER_ADDR_LEN);
>   sc->sc_ctrl_mac_tbl_uc->nentries = 1;
>  
> - if (rxfilter) {
> + if (rxfilter)
>   sc->sc_ctrl_mac_tbl_mc->nentries = nentries;
> - r = vio_set_rx_filter(sc);
> - if (r != 0) {
> - rxfilter = 0;
> - allmulti = 1; /* fallback */
> - }
> - } else {
> - 

Re: vio(4): fixup crash on up/down

2016-11-23 Thread Stefan Fritsch
On Wed, 23 Nov 2016, Rafael Zalamena wrote:

> > Maybe something like this is enough already (untested):
> 
> I tried your diff without Mike's if_vio diff and it doesn't panic anymore,
> however it doesn't work.
> 
> vioX can send packets to host, host receives them and reply, but vioX
> doesn't see any packets back. I don't even need to touch the interface
> up/down status to see this happening. Also when the interface comes
> up after being shutdown it sends a bunch of packets to host.

Sorry, device_status is a bitmask, not a plain value.

Try the patch below. The first hunk is to fix the 'sends a bunch of 
packets'. If it causes any problems, leave it out.

diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index 93def73..6436e6a 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -703,6 +703,13 @@ virtio_net_io(int dir, uint16_t reg, uint32_t *data, 
uint8_t *intr,
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
dev->cfg.device_status = *data;
+   if (*data == 0) {
+   dev->vq[0].last_avail = 0;
+   dev->vq[0].notified_avail = 0;
+   dev->vq[1].last_avail = 0;
+   dev->vq[1].notified_avail = 0;
+   /* XXX do proper reset */
+   }
break;
default:
break;
@@ -796,6 +803,9 @@ vionet_enq_rx(struct vionet_dev *dev, char *pkt, ssize_t 
sz, int *spc)
 
ret = 0;
 
+   if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK))
+   return ret;
+
vr_sz = vring_size(VIONET_QUEUE_SIZE);
q_gpa = dev->vq[0].qa;
q_gpa = q_gpa * VIRTIO_PAGE_SIZE;



Re: vio(4): fixup crash on up/down

2016-11-23 Thread Stefan Fritsch
On Wed, 23 Nov 2016, Mike Belopuhov wrote:
> > I am not convinced. Doing a reset allows to recover from all kinds of 
> > problems with DOWN/UP. That was useful when we had bugs in the event_idx 
> > implementation.
> >
> 
> I don't think this justifies it since bugs need to be fixed regardless.

And vmd bugs should be fixed in vmd.



Re: vio(4): fixup crash on up/down

2016-11-23 Thread Stefan Fritsch
On Wed, 23 Nov 2016, Mike Belopuhov wrote:
> > I guess we could do that. But then we cannot free the mbufs on DOWN
> > until the device has used them.
> 
> Diff to this effect is below.  Works on vmd and qemu (original
> one didn't because I kept the virtio_reset).
> 
> > That sounds like an unnecessary waste of memory to me.
> > 
> 
> This is not so much memory we lose and then if you up it again
> you're going to have it all back.  We can revert to the present
> behavior once vmd matures, in the meantime people won't have to
> juggle diffs around in their trees :)

I am not convinced. Doing a reset allows to recover from all kinds of 
problems with DOWN/UP. That was useful when we had bugs in the event_idx 
implementation.

Also, I don't like to change code that is known to work with at least 4 
independent device implementations to work around problems in one 
incomplete implementation that we can easily change.

Maybe something like this is enough already (untested):

--- usr.sbin/vmd/virtio.c   2016-10-20 05:05:49.049943724 +0200
+++ usr.sbin/vmd/virtio.c   2016-11-23 08:55:38.829501275 +0100
@@ -796,6 +796,9 @@
 
ret = 0;
 
+   if (dev->cfg.device_status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)
+   return ret;
+
vr_sz = vring_size(VIONET_QUEUE_SIZE);
q_gpa = dev->vq[0].qa;
q_gpa = q_gpa * VIRTIO_PAGE_SIZE;



Re: vio(4): fixup crash on up/down

2016-11-22 Thread Stefan Fritsch
On Wednesday, 23 November 2016 00:34:42 CET Mike Belopuhov wrote:
> Yes, after looking closer at virtio code I agree with you.
> However, stop/init is purely OpenBSD specific action.  There
> are no provisions or requirements from the hardware really.
> Thus we can treat UP/DOWN as purely software state and don't
> stop/reinit the PCI device.

I guess we could do that. But then we cannot free the mbufs on DOWN until the 
device has used them. That sounds like an unnecessary waste of memory to me.




Re: Add ioctl for disk cache flush

2016-10-31 Thread Stefan Fritsch
On Sunday, 30 October 2016 19:06:57 CET Theo de Raadt wrote:
> > > From: David Gwynne 

> > > whats the use of the ioctl though? why do we need extra sync cache
> > > things?
> > 
> > Right.  This smells like papering over real bugs.

If there is a power loss or a usb disk device is removed, partitions that are, 
at that point of time, not mounted or are only mounted read-only should not 
suffer data loss. The only way to ensure this is flushing the disk cache when a 
r/w mount is unmounted or changed to r/o .

The most common situation where this is an issue are devices like home 
routers, which have their root fs on a ramdisk and only store configuration 
data on the disk. When the configuration is changed, the r/o mount is changed 
to r/w, the data is written, and then the mount is changed to r/o again.

> 
> Here's the thoughts I have exchanged with sf.
> 
> This issue should not be per filesystem, not even per partition.  It
> seems this should keep track of when any partition converts from RW ->
> R access (either via direct open, or via mount), and if so schedule
> the required sync operation (to occur sometime soon).  That way if
> many such RW->R transitions occur, they won't serialize and create a
> performance hit.

It was also pointed out by Theo that it would be nicer if we could have the 
code in one central place instead of having the cache flush in every file 
system.

Another thing that occurred to me during the discussion is that maybe we want 
to ensure that no more than one cache flush operation is in progress on a 
single device at any time. Having several cache flushes sent to the device in 
parallel may uncover interesting firmware bugs that we don't want to deal with.

Apart from that, I am not sure that the performance hit is noticeable. The 
only case where lots of file systems are unmounted or re-mounted r/o is at 
shutdown and reboot. And I don't think that even a dozen cache flushes will 
cause more than 0.1s delay if there is not much data to flush. Also, I don't 
think that it is enough if the cache flush is scheduled to occur soon. It must 
occur before the mount/umount system call returns to userland. In this case, I 
greatly prefer correctness over performance. 

So, I think that any solution to merge several cache flush operations will add 
quite a bit of complexity, is difficult to get right, and has little gain. 
Unless I am missing a use case where the performance impact is not negligible.

> Furthermore I don't believe userland should have access to this.
> We've made it 30 years without needing this, why do we need it now.

I don't see a case where userland needs access to this, but doing this as an 
ioctl is the simplest implementation and preventing userland from using it may 
be unneeded complexity.

Cheers,
Stefan



Re: Add ioctl for disk cache flush

2016-10-30 Thread Stefan Fritsch
On Sun, 30 Oct 2016, Stefan Fritsch wrote:
> I agree with all your comments (and should have reviewed the initial patch 
> better). Here is an updated diff.

The FWRITE check was missing in wdioctl(). Next try:

diff --git sys/dev/ata/wd.c sys/dev/ata/wd.c
index 5e2461f..ca5ac90 100644
--- sys/dev/ata/wd.c
+++ sys/dev/ata/wd.c
@@ -26,7 +26,7 @@
  */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -138,7 +138,7 @@ void  wdstart(void *);
 void  __wdstart(struct wd_softc*, struct buf *);
 void  wdrestart(void *);
 int   wd_get_params(struct wd_softc *, u_int8_t, struct ataparams *);
-void  wd_flushcache(struct wd_softc *, int);
+int   wd_flushcache(struct wd_softc *, int);
 void  wd_standby(struct wd_softc *, int);
 
 /* XXX: these should go elsewhere */
@@ -848,6 +848,14 @@ wdioctl(dev_t dev, u_long xfer, caddr_t addr, int flag, 
struct proc *p)
}
 #endif
 
+   case DIOCCACHESYNC:
+   if ((flag & FWRITE) == 0) {
+   error = EBADF;
+   goto exit;
+   }
+   error = wd_flushcache(wd, AT_WAIT);
+   goto exit;
+
default:
error = wdc_ioctl(wd->drvp, xfer, addr, flag, p);
goto exit;
@@ -1067,13 +1075,13 @@ wd_get_params(struct wd_softc *wd, u_int8_t flags, 
struct ataparams *params)
}
 }
 
-void
+int
 wd_flushcache(struct wd_softc *wd, int flags)
 {
struct wdc_command wdc_c;
 
if (wd->drvp->ata_vers < 4) /* WDCC_FLUSHCACHE is here since ATA-4 */
-   return;
+   return EIO;
bzero(_c, sizeof(struct wdc_command));
wdc_c.r_command = (wd->sc_flags & WDF_LBA48 ? WDCC_FLUSHCACHE_EXT :
WDCC_FLUSHCACHE);
@@ -1088,20 +1096,28 @@ wd_flushcache(struct wd_softc *wd, int flags)
if (wdc_exec_command(wd->drvp, _c) != WDC_COMPLETE) {
printf("%s: flush cache command didn't complete\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
+   if (wdc_c.flags & ERR_NODEV)
+   return ENODEV;
if (wdc_c.flags & AT_TIMEOU) {
printf("%s: flush cache command timeout\n",
wd->sc_dev.dv_xname);
+   return EIO;
+   }
+   if (wdc_c.flags & AT_ERROR) {
+   if (wdc_c.r_error == WDCE_ABRT) /* command not supported */
+   return ENODEV;
+   printf("%s: flush cache command: error 0x%x\n",
+   wd->sc_dev.dv_xname, wdc_c.r_error);
+   return EIO;
}
if (wdc_c.flags & AT_DF) {
printf("%s: flush cache command: drive fault\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
-   /*
-* Ignore error register, it shouldn't report anything else
-* than COMMAND ABORTED, which means the device doesn't support
-* flush cache
-*/
+   return 0;
 }
 
 void
diff --git sys/scsi/sd.c sys/scsi/sd.c
index 30fb36b..e5a0139 100644
--- sys/scsi/sd.c
+++ sys/scsi/sd.c
@@ -2,7 +2,7 @@
 /* $NetBSD: sd.c,v 1.111 1997/04/02 02:29:41 mycroft Exp $ */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -96,7 +96,7 @@ int   sd_vpd_block_limits(struct sd_softc *, int);
 intsd_vpd_thin(struct sd_softc *, int);
 intsd_thin_params(struct sd_softc *, int);
 intsd_get_parms(struct sd_softc *, struct disk_parms *, int);
-void   sd_flush(struct sd_softc *, int);
+intsd_flush(struct sd_softc *, int);
 
 void   viscpy(u_char *, u_char *, int);
 
@@ -999,6 +999,15 @@ sdioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, 
struct proc *p)
error = sd_ioctl_cache(sc, cmd, (struct dk_cache *)addr);
goto exit;
 
+   case DIOCCACHESYNC:
+   if (!ISSET(flag, FWRITE)) {
+   error = EBADF;
+   goto exit;
+   }
+   if ((sc->flags & SDF_DIRTY) != 0 || *(int *)addr != 0)
+   error = sd_flush(sc, 0);
+   goto exit;
+
default:
if (part != RAW_PART) {
error = ENOTTY;
@@ -1876,19 +1885,20 @@ die:
return (SDGP_RESULT_OFFLINE);
 }
 
-void
+int
 sd_flush(struct sd_softc *sc, int flags)
 {
struct scsi_link *link;
struct scsi_xfer *xs;
struct scsi_synchronize_cache *cmd;
+   int error;
 
if (sc->flags & SDF_DYING)
-   return;
+   return (ENXIO);
   

Re: Add ioctl for disk cache flush

2016-10-30 Thread Stefan Fritsch
On Mon, 17 Oct 2016, Alexander Bluhm wrote:
> > Part 1: Add an ioctl:
> > 
> > 
> > add DIOCCACHESYNC
> > 
> > Add an ioctl to tell storage devices to flush their internal caches.
> > Ported from netbsd by pedro@
> > 
> > OK?
> 
> Some remarks inline

Sorry for the long delay, I was very busy in the last weeks.

I agree with all your comments (and should have reviewed the initial patch 
better). Here is an updated diff.

Cheers,
Stefan

diff --git sys/dev/ata/wd.c sys/dev/ata/wd.c
index 5e2461f..a644cbf 100644
--- sys/dev/ata/wd.c
+++ sys/dev/ata/wd.c
@@ -26,7 +26,7 @@
  */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -138,7 +138,7 @@ void  wdstart(void *);
 void  __wdstart(struct wd_softc*, struct buf *);
 void  wdrestart(void *);
 int   wd_get_params(struct wd_softc *, u_int8_t, struct ataparams *);
-void  wd_flushcache(struct wd_softc *, int);
+int   wd_flushcache(struct wd_softc *, int);
 void  wd_standby(struct wd_softc *, int);
 
 /* XXX: these should go elsewhere */
@@ -848,6 +848,10 @@ wdioctl(dev_t dev, u_long xfer, caddr_t addr, int flag, 
struct proc *p)
}
 #endif
 
+   case DIOCCACHESYNC:
+   error = wd_flushcache(wd, AT_WAIT);
+   goto exit;
+
default:
error = wdc_ioctl(wd->drvp, xfer, addr, flag, p);
goto exit;
@@ -1067,13 +1071,13 @@ wd_get_params(struct wd_softc *wd, u_int8_t flags, 
struct ataparams *params)
}
 }
 
-void
+int
 wd_flushcache(struct wd_softc *wd, int flags)
 {
struct wdc_command wdc_c;
 
if (wd->drvp->ata_vers < 4) /* WDCC_FLUSHCACHE is here since ATA-4 */
-   return;
+   return EIO;
bzero(_c, sizeof(struct wdc_command));
wdc_c.r_command = (wd->sc_flags & WDF_LBA48 ? WDCC_FLUSHCACHE_EXT :
WDCC_FLUSHCACHE);
@@ -1088,20 +1092,28 @@ wd_flushcache(struct wd_softc *wd, int flags)
if (wdc_exec_command(wd->drvp, _c) != WDC_COMPLETE) {
printf("%s: flush cache command didn't complete\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
+   if (wdc_c.flags & ERR_NODEV)
+   return ENODEV;
if (wdc_c.flags & AT_TIMEOU) {
printf("%s: flush cache command timeout\n",
wd->sc_dev.dv_xname);
+   return EIO;
+   }
+   if (wdc_c.flags & AT_ERROR) {
+   if (wdc_c.r_error == WDCE_ABRT) /* command not supported */
+   return ENODEV;
+   printf("%s: flush cache command: error 0x%x\n",
+   wd->sc_dev.dv_xname, wdc_c.r_error);
+   return EIO;
}
if (wdc_c.flags & AT_DF) {
printf("%s: flush cache command: drive fault\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
-   /*
-* Ignore error register, it shouldn't report anything else
-* than COMMAND ABORTED, which means the device doesn't support
-* flush cache
-*/
+   return 0;
 }
 
 void
diff --git sys/scsi/sd.c sys/scsi/sd.c
index 30fb36b..e5a0139 100644
--- sys/scsi/sd.c
+++ sys/scsi/sd.c
@@ -2,7 +2,7 @@
 /* $NetBSD: sd.c,v 1.111 1997/04/02 02:29:41 mycroft Exp $ */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -96,7 +96,7 @@ int   sd_vpd_block_limits(struct sd_softc *, int);
 intsd_vpd_thin(struct sd_softc *, int);
 intsd_thin_params(struct sd_softc *, int);
 intsd_get_parms(struct sd_softc *, struct disk_parms *, int);
-void   sd_flush(struct sd_softc *, int);
+intsd_flush(struct sd_softc *, int);
 
 void   viscpy(u_char *, u_char *, int);
 
@@ -999,6 +999,15 @@ sdioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, 
struct proc *p)
error = sd_ioctl_cache(sc, cmd, (struct dk_cache *)addr);
goto exit;
 
+   case DIOCCACHESYNC:
+   if (!ISSET(flag, FWRITE)) {
+   error = EBADF;
+   goto exit;
+   }
+   if ((sc->flags & SDF_DIRTY) != 0 || *(int *)addr != 0)
+   error = sd_flush(sc, 0);
+   goto exit;
+
default:
if (part != RAW_PART) {
error = ENOTTY;
@@ -1876,19 +1885,20 @@ die:
return (SDGP_RESULT_OFFLINE);
 }
 
-void
+int
 sd_flush(struct sd_softc *sc, int flags)
 {
struct scsi_link *link;
struct scsi_xfer *xs;
struct scsi_synchronize_cache *cmd;
+   int error;
 
if (sc->flags & SDF_DYING)
-   return;
+   return (ENXIO);
link = 

Re: usb disk dirty after every reboot

2016-10-29 Thread Stefan Fritsch
On Thursday, 20 October 2016 20:55:47 CEST Lampshade wrote:
> What if somebody remounts
> from normal options
> (metadata: sync, data: async)
> to fully synced
> (metadata: sync, data: sync)
> 
> Example:
> # mount | grep home
> /dev/sd1h on /home type ffs (local, noatime, nodev, nosuid)
> # /sbin/mount -u -o sync,rw,nodev,nosuid,noatime /home
> # mount | grep home
> /dev/sd1h on /home type ffs (local, noatime, nodev, nosuid, synchronous)
> 
> Should this flush cache for this particular filesystem?

I don't think so because sync mounts make no guarantee with respect to the 
disk's write cache. But this is not a strong opinion.

Cheers,
Stefan



ffs/msdosfs: Flush cache when updating mount to R/O

2016-10-16 Thread Stefan Fritsch
On Sun, 16 Oct 2016, Stefan Fritsch wrote:
> > * When a R/W mount is updated to R/O. I will send patches for this in a 
> > separate mail.

Part 2: Use it


msdosfs & ffs: flush cache if updating mount to r/o

Other filesystems can be changed later.

ok?

diff --git sys/msdosfs/msdosfs_vfsops.c sys/msdosfs/msdosfs_vfsops.c
index e13b0b1..ae1a6d9 100644
--- sys/msdosfs/msdosfs_vfsops.c
+++ sys/msdosfs/msdosfs_vfsops.c
@@ -64,6 +64,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -128,8 +129,13 @@ msdosfs_mount(struct mount *mp, const char *path, void 
*data,
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = vflush(mp, NULLVP, flags);
-   if (!error)
+   if (!error) {
+   int force = 0;
+
pmp->pm_flags |= MSDOSFSMNT_RONLY;
+   /* may be not supported, ignore error */
+   VOP_IOCTL(pmp->pm_devvp, DIOCCACHESYNC, , 
 FWRITE, FSCRED, p);
+   }
}
if (!error && (mp->mnt_flag & MNT_RELOAD))
/* not yet implemented */
diff --git sys/ufs/ffs/ffs_vfsops.c sys/ufs/ffs/ffs_vfsops.c
index 99eaf52..abe5756 100644
--- sys/ufs/ffs/ffs_vfsops.c
+++ sys/ufs/ffs/ffs_vfsops.c
@@ -456,6 +456,16 @@ success:
fs->fs_flags &= ~FS_DOSOFTDEP;
}
ffs_sbupdate(ump, MNT_WAIT);
+   if (ronly) {
+   int force = 0;
+
+   /*
+* Updating mount to readonly. Try a cache flush.
+* Ignore error because the ioctl may not be supported.
+*/
+   VOP_IOCTL(ump->um_devvp, DIOCCACHESYNC, ,
+   FWRITE, FSCRED, p);
+   }
}
return (0);
 



Add ioctl for disk cache flush

2016-10-16 Thread Stefan Fritsch
On Sun, 16 Oct 2016, Stefan Fritsch wrote:
> * When a R/W mount is updated to R/O. I will send patches for this in a 
> separate mail.

Part 1: Add an ioctl:


add DIOCCACHESYNC

Add an ioctl to tell storage devices to flush their internal caches.
Ported from netbsd by pedro@

OK?

diff --git sys/dev/ata/wd.c sys/dev/ata/wd.c
index 5e2461f..280ec56 100644
--- sys/dev/ata/wd.c
+++ sys/dev/ata/wd.c
@@ -26,7 +26,7 @@
  */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -138,7 +138,7 @@ void  wdstart(void *);
 void  __wdstart(struct wd_softc*, struct buf *);
 void  wdrestart(void *);
 int   wd_get_params(struct wd_softc *, u_int8_t, struct ataparams *);
-void  wd_flushcache(struct wd_softc *, int);
+int   wd_flushcache(struct wd_softc *, int);
 void  wd_standby(struct wd_softc *, int);
 
 /* XXX: these should go elsewhere */
@@ -848,6 +848,10 @@ wdioctl(dev_t dev, u_long xfer, caddr_t addr, int flag, 
struct proc *p)
}
 #endif
 
+   /* XXX pedro: should set AT_WAIT according to force flag */
+   case DIOCCACHESYNC:
+   return wd_flushcache(wd, AT_WAIT);
+
default:
error = wdc_ioctl(wd->drvp, xfer, addr, flag, p);
goto exit;
@@ -1067,13 +1071,13 @@ wd_get_params(struct wd_softc *wd, u_int8_t flags, 
struct ataparams *params)
}
 }
 
-void
+int
 wd_flushcache(struct wd_softc *wd, int flags)
 {
struct wdc_command wdc_c;
 
if (wd->drvp->ata_vers < 4) /* WDCC_FLUSHCACHE is here since ATA-4 */
-   return;
+   return EIO;
bzero(_c, sizeof(struct wdc_command));
wdc_c.r_command = (wd->sc_flags & WDF_LBA48 ? WDCC_FLUSHCACHE_EXT :
WDCC_FLUSHCACHE);
@@ -1088,20 +1092,18 @@ wd_flushcache(struct wd_softc *wd, int flags)
if (wdc_exec_command(wd->drvp, _c) != WDC_COMPLETE) {
printf("%s: flush cache command didn't complete\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
-   if (wdc_c.flags & AT_TIMEOU) {
-   printf("%s: flush cache command timeout\n",
-   wd->sc_dev.dv_xname);
+   if (wdc_c.flags & AT_ERROR) {
+   if (wdc_c.r_error == WDCE_ABRT) /* command not supported */
+   return ENODEV;
}
-   if (wdc_c.flags & AT_DF) {
-   printf("%s: flush cache command: drive fault\n",
+   if (wdc_c.flags & (AT_ERROR | AT_TIMEOU | AT_DF)) {
+   printf("%s: flush cache command timeout\n",
wd->sc_dev.dv_xname);
+   return EIO;
}
-   /*
-* Ignore error register, it shouldn't report anything else
-* than COMMAND ABORTED, which means the device doesn't support
-* flush cache
-*/
+   return 0;
 }
 
 void
diff --git sys/scsi/sd.c sys/scsi/sd.c
index 30fb36b..8fe7375 100644
--- sys/scsi/sd.c
+++ sys/scsi/sd.c
@@ -2,7 +2,7 @@
 /* $NetBSD: sd.c,v 1.111 1997/04/02 02:29:41 mycroft Exp $ */
 
 /*-
- * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 2003, 2004 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -96,7 +96,7 @@ int   sd_vpd_block_limits(struct sd_softc *, int);
 intsd_vpd_thin(struct sd_softc *, int);
 intsd_thin_params(struct sd_softc *, int);
 intsd_get_parms(struct sd_softc *, struct disk_parms *, int);
-void   sd_flush(struct sd_softc *, int);
+intsd_flush(struct sd_softc *, int);
 
 void   viscpy(u_char *, u_char *, int);
 
@@ -999,6 +999,15 @@ sdioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, 
struct proc *p)
error = sd_ioctl_cache(sc, cmd, (struct dk_cache *)addr);
goto exit;
 
+   case DIOCCACHESYNC:
+   if (!ISSET(flag, FWRITE)) {
+   error = EBADF;
+   goto exit;
+   }
+   if ((sc->flags & SDF_DIRTY) != 0 || *(int *)addr != 0)
+   error = sd_flush(sc, 0);
+   return (error);
+
default:
if (part != RAW_PART) {
error = ENOTTY;
@@ -1876,19 +1885,20 @@ die:
return (SDGP_RESULT_OFFLINE);
 }
 
-void
+int
 sd_flush(struct sd_softc *sc, int flags)
 {
struct scsi_link *link;
struct scsi_xfer *xs;
struct scsi_synchronize_cache *cmd;
+   int error;
 
if (sc->flags & SDF_DYING)
-   return;
+   return (ENXIO);
link = sc->sc_link;
 
if (link->quirks & SDEV_NOSYNCCACHE)
-   return;
+   return (0);
 
/*
 * Issue 

Re: usb disk dirty after every reboot

2016-10-16 Thread Stefan Fritsch
[moving to tech@]

On Tuesday, 20 September 2016 08:03:32 CEST Stefan Fritsch wrote:
> On Tue, 20 Sep 2016, Darren Tucker wrote:
> > On Tue, Sep 20, 2016 at 1:43 AM, Jan Stary <h...@stare.cz> wrote:
> > > This is current/i386 on an ALIX.1E (demsg below).
> > > I have an USB disk connected for /backup.
> > > 
> > > Upon every reboot, the filesystem on that disk is dirty:
> > > WARNING: R/W mount of /backup denied.  Filesystem is not clean - run
> > > fsck
> > 
> > I saw something similar on an APU where the root disk was on
> > (USB-attached) sdcard:
> > http://marc.info/?l=openbsd-misc=144237305322074=2
> > 
> > It seems to be a race.  There used to be a 4sec pause in the kernel
> > that was removed:
> > 
> > """
> > Remove 4 second delay on reboot/shutdown that was added 8 years
> > ago to "workaround MP timeout/splhigh/scsi race at reboot time".
> > """
> 
> I think before we re-add some arbitrary delays, we should check if we are
> actually sending an explicit cache flush command to the disk controllers.
> I have some code somewhere that does this for umount and mount -ur. I will
> look for it and send it to tech@, but probably not today.

I found a few cases, where we should send a cache flush but don't. 
Unfortunately, none of these cases explain the problem seen by Jan and Darren.

The cases I have found are:

* When a R/W mount is updated to R/O. I will send patches for this in a 
separate mail.

* When a R/W mount is unmounted but there is still another partition from the 
same disk mounted.

* When sync(2)  is called. Though I am not 100% sure if we really want to do a 
cache flush for every sync. Thoughts?

For the usb disk issue, some more debugging is necessary.

Cheers,
Stefan



Re: use x2apic if it is enabled by BIOS

2016-10-14 Thread Stefan Fritsch
On Friday, 14 October 2016 15:48:47 CEST Mark Kettenis wrote:
> > Date: Fri, 14 Oct 2016 16:49:31 +0900 (JST)
> > From: YASUOKA Masahiko 
> > 
> > Hi,
> > 
> > I'm working on NEC Express5800/R110h-1 (dmesg is attached).  On this
> > machine, our kernel panics with following message.
> > 
> >   cpu0 at mainbus0panic: cpu at apic id 0 already attached?
> > 
> > This seems to happen since x2APIC on the machine is enabled by BIOS
> > and the kernel doesn't assume that.  The diff makes the kernel use
> > x2APIC if it is enabled by BIOS.
> > 
> > ok?
> 
> is the APICBASE msr available on all amd64 machines?  If not, it will
> need to be protected with a CPUID check.

Yes. I have looked that up before, for its use in the suspend/resume code.

Stefan



Re: 64bit DMA on amd64

2016-07-11 Thread Stefan Fritsch
On Mon, 11 Jul 2016, Theo de Raadt wrote:
> > No, I didn't know that. I assumed that having a few more GBs of bufcache 
> > would help the performance. Until that is the case, 64bit dma does not 
> > make much sense.
> 
> BTW, my tests were on a 128GB sun4v machine.  Sun T5140.  They are
> actually fairly cheap used these days.
> 
> A maximum sized buffer cache should be fast.  However there is no need
> for it to be dma-reachable.  Bob's buffer cache flipper can bounce it
> to high memory easily after it is read the first time, and preserve it
> in otherwise unused memory.  A buffer cache object of that sort is
> never written back to the io path.  Also, it can be discarded in any
> memory shortage condition without cost.

But flipping buffers is not without cost. Especially for a SSD at rates of 
>200 MB/s (or even > 500 MB/s). With 64bit DMA, one could have a large 
buffer cache without this cost. But actual benchmarks would be required to 
see how relevant this is.



Re: 64bit DMA on amd64

2016-07-11 Thread Stefan Fritsch
On Mon, 11 Jul 2016, Theo de Raadt wrote:

> > Openbsd on amd64 assumes that DMA is only possible to the lower 4GB.
> 
> Not exactly.  On an architecture-by-architecture basis, OpenBSD is
> capable of insisting DMA reachable memory only lands in a smaller zone
> of memory -- because it makes the other layers of code easier.
> 
> > More interesting would be bufs and mbufs.
> 
> Why is it interesting for mbufs?  Please describe the environment
> where anywhere near that many mbufs make sense.
>
> And bufs don't need it either.  Have you actually cranked your buffer
> cache that high?  I have test this, on sparc64 which has unlimited DMA
> reach due to the iommu.  The system comes to a crawl when there are
> too many mbufs or bufs, probably due to management structures unable
> to handle the pressure.

No, I didn't know that. I assumed that having a few more GBs of bufcache 
would help the performance. Until that is the case, 64bit dma does not 
make much sense.

> 
> What is the usage case for this diff, if it cannot be enabled?
> 



Re: 64bit DMA on amd64

2016-07-11 Thread Stefan Fritsch
On Mon, 11 Jul 2016, Ted Unangst wrote:

> Stefan Fritsch wrote:
> > On Mon, 11 Jul 2016, Reyk Floeter wrote:
> > > The intentional 4GB limit is for forwarding: what if you forward mbufs 
> > > from a 64bit-capable interface to another one that doesn't support 64bit 
> > > DMA? And even if you would only enable it if all interfaces are 
> > > 64bit-capable, what if you plug in a 32bit USB/hotplug interface? We did 
> > > not want to support bounce buffers in OpenBSD.
> > 
> > Yes, I have understood that. My mail was more about non-mbuf DMA: Does it 
> > make sense to allow 64bit DMA in other cases while keeping the 4GB 
> > limitation for mbufs?
> 
> every kind of device can be attached via usb now. for the code that supports
> flipping, like bufcache, this is still tricky to handle dynamic limit changes.
> what happens to buffers marked DMA that suddenly aren't?

I guess the flipping would have to be done just before the device driver 
is called, but after it is clear which device driver will be called. Not 
sure if that is feasible or worth the effort. That's what I wanted to find 
out with my mail ;)

BTW, for usb devices, it probably depends on the host controller if 64bit 
dma is possible or not. I guess most xhci controllers will be able to do 
it.

> That said, I'm not 100% convinced the fear of bounce buffers is justified. If
> a USB device requires bouncing, it's already pretty slow. What are we
> optimizing for again?

True for spinning disks or usb storage sticks. But an SSD attached via USB 
3.x is not slow.

> Or something could be done to bring iommu to life.

The problem is that there are many systems that dont' have any. Or for 
openbsd in VMs, it may be expensive for the host system to emulate the 
iommu.


Cheers,
Stefan



Re: 64bit DMA on amd64

2016-07-11 Thread Stefan Fritsch
On Mon, 11 Jul 2016, Reyk Floeter wrote:
> The intentional 4GB limit is for forwarding: what if you forward mbufs 
> from a 64bit-capable interface to another one that doesn't support 64bit 
> DMA? And even if you would only enable it if all interfaces are 
> 64bit-capable, what if you plug in a 32bit USB/hotplug interface? We did 
> not want to support bounce buffers in OpenBSD.

Yes, I have understood that. My mail was more about non-mbuf DMA: Does it 
make sense to allow 64bit DMA in other cases while keeping the 4GB 
limitation for mbufs?

Cheers,
Stefan

> 
> Reyk
> 
> > On 11.07.2016, at 15:37, Stefan Fritsch <s...@sfritsch.de> wrote:
> > 
> > Hi,
> > 
> > following the discussion about mbufs, I have some questions about 64bit 
> > DMA in general.
> > 
> > Openbsd on amd64 assumes that DMA is only possible to the lower 4GB. But 
> > there are many devices (PCIe, virtio, ...) that can do DMA to the whole 
> > memory. Is it feasible to have known good devices opt-in into 64bit DMA?
> > 
> > I have done a patch that allows virtio to do 64bit DMA. This works insofar 
> > as the queues used by the device will now be allocated above 4GB. But this 
> > is only a small amount of memory. More interesting would be bufs and 
> > mbufs.
> > 
> > 
> > For bufs, Bob added some code for copying bufs above/below 4GB. But this 
> > code only has a single flag B_DMA to denote if DMA is possible into a buf 
> > or not. Would it make sense to replace that by a mechanism that is device 
> > specific, so that we can use devices efficiently that allow 64bit DMA? 
> > Maybe a flag in the device vnode?
> > 
> > 
> > Does it make sense to commit something like the diff below (not tested 
> > much), even if it saves at most a few MB below 4GB right now?
> > 
> > Cheers,
> > Stefan
> > 
> > 
> > diff --git sys/arch/amd64/amd64/bus_dma.c sys/arch/amd64/amd64/bus_dma.c
> > index 8eaa2e7..1aba7c0 100644
> > --- sys/arch/amd64/amd64/bus_dma.c
> > +++ sys/arch/amd64/amd64/bus_dma.c
> > @@ -293,6 +293,7 @@ _bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, 
> > bus_dma_segment_t *segs,
> > {
> > bus_addr_t paddr, baddr, bmask, lastaddr = 0;
> > bus_size_t plen, sgsize, mapsize;
> > +   struct uvm_constraint_range *constraint = t->_cookie;
> > int first = 1;
> > int i, seg = 0;
> > 
> > @@ -320,7 +321,7 @@ _bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, 
> > bus_dma_segment_t *segs,
> > if (plen < sgsize)
> > sgsize = plen;
> > 
> > -   if (paddr > dma_constraint.ucr_high)
> > +   if (paddr > constraint->ucr_high)
> > panic("Non dma-reachable buffer at paddr 
> > %#lx(raw)",
> > paddr);
> > 
> > @@ -405,15 +406,11 @@ _bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, 
> > bus_size_t alignment,
> > bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
> > int flags)
> > {
> > +   struct uvm_constraint_range *constraint = t->_cookie;
> > 
> > -   /*
> > -* XXX in the presence of decent (working) iommus and bouncebuffers
> > -* we can then fallback this allocation to a range of { 0, -1 }.
> > -* However for now  we err on the side of caution and allocate dma
> > -* memory under the 4gig boundary.
> > -*/
> > return (_bus_dmamem_alloc_range(t, size, alignment, boundary,
> > -   segs, nsegs, rsegs, flags, (bus_addr_t)0, (bus_addr_t)0x));
> > +   segs, nsegs, rsegs, flags, (bus_addr_t)constraint->ucr_low,
> > +   (bus_addr_t)constraint->ucr_high));
> > }
> > 
> > /*
> > @@ -567,6 +564,7 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t 
> > map, void *buf,
> > bus_size_t sgsize;
> > bus_addr_t curaddr, lastaddr, baddr, bmask;
> > vaddr_t vaddr = (vaddr_t)buf;
> > +   struct uvm_constraint_range *constraint = t->_cookie;
> > int seg;
> > pmap_t pmap;
> > 
> > @@ -584,7 +582,7 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t 
> > map, void *buf,
> >  */
> > pmap_extract(pmap, vaddr, (paddr_t *));
> > 
> > -   if (curaddr > dma_constraint.ucr_high)
> > +   if (curaddr > constraint->ucr_high)
> > panic("Non dma-reachable buffer at curaddr %#lx(raw)",
> > cura

64bit DMA on amd64

2016-07-11 Thread Stefan Fritsch
Hi,

following the discussion about mbufs, I have some questions about 64bit 
DMA in general.

Openbsd on amd64 assumes that DMA is only possible to the lower 4GB. But 
there are many devices (PCIe, virtio, ...) that can do DMA to the whole 
memory. Is it feasible to have known good devices opt-in into 64bit DMA?

I have done a patch that allows virtio to do 64bit DMA. This works insofar 
as the queues used by the device will now be allocated above 4GB. But this 
is only a small amount of memory. More interesting would be bufs and 
mbufs.


For bufs, Bob added some code for copying bufs above/below 4GB. But this 
code only has a single flag B_DMA to denote if DMA is possible into a buf 
or not. Would it make sense to replace that by a mechanism that is device 
specific, so that we can use devices efficiently that allow 64bit DMA? 
Maybe a flag in the device vnode?


Does it make sense to commit something like the diff below (not tested 
much), even if it saves at most a few MB below 4GB right now?

Cheers,
Stefan


diff --git sys/arch/amd64/amd64/bus_dma.c sys/arch/amd64/amd64/bus_dma.c
index 8eaa2e7..1aba7c0 100644
--- sys/arch/amd64/amd64/bus_dma.c
+++ sys/arch/amd64/amd64/bus_dma.c
@@ -293,6 +293,7 @@ _bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, 
bus_dma_segment_t *segs,
 {
bus_addr_t paddr, baddr, bmask, lastaddr = 0;
bus_size_t plen, sgsize, mapsize;
+   struct uvm_constraint_range *constraint = t->_cookie;
int first = 1;
int i, seg = 0;
 
@@ -320,7 +321,7 @@ _bus_dmamap_load_raw(bus_dma_tag_t t, bus_dmamap_t map, 
bus_dma_segment_t *segs,
if (plen < sgsize)
sgsize = plen;
 
-   if (paddr > dma_constraint.ucr_high)
+   if (paddr > constraint->ucr_high)
panic("Non dma-reachable buffer at paddr 
%#lx(raw)",
paddr);
 
@@ -405,15 +406,11 @@ _bus_dmamem_alloc(bus_dma_tag_t t, bus_size_t size, 
bus_size_t alignment,
 bus_size_t boundary, bus_dma_segment_t *segs, int nsegs, int *rsegs,
 int flags)
 {
+   struct uvm_constraint_range *constraint = t->_cookie;
 
-   /*
-* XXX in the presence of decent (working) iommus and bouncebuffers
-* we can then fallback this allocation to a range of { 0, -1 }.
-* However for now  we err on the side of caution and allocate dma
-* memory under the 4gig boundary.
-*/
return (_bus_dmamem_alloc_range(t, size, alignment, boundary,
-   segs, nsegs, rsegs, flags, (bus_addr_t)0, (bus_addr_t)0x));
+   segs, nsegs, rsegs, flags, (bus_addr_t)constraint->ucr_low,
+   (bus_addr_t)constraint->ucr_high));
 }
 
 /*
@@ -567,6 +564,7 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, 
void *buf,
bus_size_t sgsize;
bus_addr_t curaddr, lastaddr, baddr, bmask;
vaddr_t vaddr = (vaddr_t)buf;
+   struct uvm_constraint_range *constraint = t->_cookie;
int seg;
pmap_t pmap;
 
@@ -584,7 +582,7 @@ _bus_dmamap_load_buffer(bus_dma_tag_t t, bus_dmamap_t map, 
void *buf,
 */
pmap_extract(pmap, vaddr, (paddr_t *));
 
-   if (curaddr > dma_constraint.ucr_high)
+   if (curaddr > constraint->ucr_high)
panic("Non dma-reachable buffer at curaddr %#lx(raw)",
curaddr);
 
diff --git sys/arch/amd64/amd64/machdep.c sys/arch/amd64/amd64/machdep.c
index de9f481..7640532 100644
--- sys/arch/amd64/amd64/machdep.c
+++ sys/arch/amd64/amd64/machdep.c
@@ -201,6 +201,12 @@ struct vm_map *phys_map = NULL;
 
 /* UVM constraint ranges. */
 struct uvm_constraint_range  isa_constraint = { 0x0, 0x00ffUL };
+   /*
+* XXX in the presence of decent (working) iommus and bouncebuffers
+* we can then fallback this allocation to a range of { 0, -1 }.
+* However for now  we err on the side of caution and allocate dma
+* memory under the 4gig boundary.
+*/
 struct uvm_constraint_range  dma_constraint = { 0x0, 0xUL };
 struct uvm_constraint_range *uvm_md_constraints[] = {
 _constraint,
diff --git sys/arch/amd64/include/pci_machdep.h 
sys/arch/amd64/include/pci_machdep.h
index 27b833b..bf54f31 100644
--- sys/arch/amd64/include/pci_machdep.h
+++ sys/arch/amd64/include/pci_machdep.h
@@ -41,6 +41,7 @@
  */
 
 extern struct bus_dma_tag pci_bus_dma_tag;
+extern struct bus_dma_tag virtio_pci_bus_dma_tag;
 
 /*
  * Types provided to machine-independent PCI code
diff --git sys/arch/amd64/isa/isa_machdep.c sys/arch/amd64/isa/isa_machdep.c
index 74dc907..ec35edead 100644
--- sys/arch/amd64/isa/isa_machdep.c
+++ sys/arch/amd64/isa/isa_machdep.c
@@ -140,7 +140,7 @@ void_isa_dma_free_bouncebuf(bus_dma_tag_t, 
bus_dmamap_t);
  * buffers, if necessary.
  */
 struct bus_dma_tag isa_bus_dma_tag = {
-   NULL,   

Re: [PATCH] let the mbufs use more then 4gb of memory

2016-06-25 Thread Stefan Fritsch
On Thursday 23 June 2016 14:41:53, Mark Kettenis wrote:
> We really don't want to implement bounce-buffers.  Adding IOMMU
> support is probably a better approach as it also brings some
> security benefits.  Not all amd64 hardware supports an IOMMU.  And
> hardware that does support it doesn't always have it enabled.  But
> for modern hardware an iommu is pretty much standard, except for
> the absolute low-end.  But those low-end machines tend to have only
> 2GB of memory anyway.

On amd64, modern would mean skylake or newer. At least until haswell 
(not sure about broadwell), Intel considered vt-d to be a high-end 
feature and many desktop CPUs don't have it enabled. It is easy to 
find systems with >=16 GB RAM without IOMMU.

Stefan



Re: static in ze kernel

2016-05-08 Thread Stefan Fritsch
On Friday 29 April 2016 14:09:23, Mark Kettenis wrote:
> I think we should simply continue the current practice of not using
> static in the kernel.  I mean, what is the benefit of abandoning
> that practice if you disable the optimizations that compiler would
> make anyway?

You get a clear separation of public and non-public interfaces. You 
can throw away hundreds of useless prototypes for local functions. You 
are less likely to acumulate multiple prototypes for the same function 
which are later not kept in sync and cause subtle bugs.

Not using static in the kernel is bad.



Re: Checking MAC address of incoming unicast packets

2016-01-21 Thread Stefan Fritsch
As there isn't any consensus for one of the other solutions, I am going to 
commit the patch that fixes this inside vio(4).

On Thu, 14 Jan 2016, Stefan Fritsch wrote:
> On Mon, 4 Jan 2016, Stefan Fritsch wrote:
> > On Sun, 3 Jan 2016, Theo de Raadt wrote:
> > > >> dlg writes:
> > > >> > should we just do it unconditionally? is there a downside to that?
> > > >
> > > >It may decrease performance a tiny bit. Since such bits tend to add 
> > > >up, I would be hesitant to enable the check for drivers that don't 
> > > >need it. OTOH, freebsd and linux seem to always do the check.
> > > 
> > > How does it decrease performance?
> > 
> > I am talking about this diff in ether_input():
> > 
> > diff --git sys/net/if_ethersubr.c sys/net/if_ethersubr.c
> > --- sys/net/if_ethersubr.c
> > +++ sys/net/if_ethersubr.c
> > @@ -353,8 +353,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m, void 
> > *cookie)
> >  * If packet is unicast and we're in promiscuous mode, make sure it
> >  * is for us.  Drop otherwise.
> >  */
> > -   if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 &&
> > -   (ifp->if_flags & IFF_PROMISC)) {
> > +   if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
> > if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) {
> > m_freem(m);
> > return (1);
> > 
> > For other drivers than vio this means an additional 6 byte memcmp where 
> > one operand is already in the cache, and the other operand may or may not 
> > be in the cache. 'tiny performance impact' seems about right.
> 
> Hrvoje Popovski was very helpful and did some performance tests with the 
> patch above. There is only measurable difference at very high package 
> rates, but I must admit that there the difference is a bit larger than I 
> expected:
> 
> > i'm sending 64byte packets from host connected to ix2 and counting then
> > on host connected to ix3
> 
> > without your patch
> 
> sending receiving
> 400kpps 400kpps
> 600kpps 600kpps
> 800kpps 690kpps
> 1Mpps   610kpps
> 1.4Mpps 580kpps
> 12Mpps  580kpps
> 
> 
> > with your patch
> 
> sending receiving
> 400kpps 400kpps -   0%
> 600kpps 600kpps -   0%
> 800kpps 680kpps - 1.4%
> 1Mpps   600kpps - 1.6%
> 1.4Mpps 560kpps - 3.4%
> 12Mpps  560kpps - 3.4%
> 
> 
> 
> So, which variant should I commit?
> 
> - do the check unconditionally
> - add some flag so that ether_input() does the check for vio(4) 
> unconditionally
> - do the check in vio(4)
> 
> mpi fixed a similar bug in trunk(4) in July, but that does not use 
> ether_input() and would not profit from the flag approach.
> 
> 
> 
> > > Maybe you should show the diff which does the check in the driver
> > > itself, then then it will be easier to see the performance cost.  Right
> > > now I can't perceive the problem.
> > 
> > It's not very expensive. But it's more code duplication than I like:
> > 
> > diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
> > --- sys/dev/pci/if_vio.c
> > +++ sys/dev/pci/if_vio.c
> > @@ -1005,6 +1005,9 @@ vio_rxeof(struct vio_softc *sc)
> > int r = 0;
> > int slot, len, bufs_left;
> > struct virtio_net_hdr *hdr;
> > +   int promisc = (ifp->if_flags & IFF_PROMISC);
> > +   u_int8_t *enaddr = sc->sc_ac.ac_enaddr;
> > +   struct ether_header *eh;
> >  
> > while (virtio_dequeue(vsc, vq, , ) == 0) {
> > r = 1;
> > @@ -1035,10 +1038,33 @@ vio_rxeof(struct vio_softc *sc)
> > bufs_left--;
> > }
> >  
> > -   if (bufs_left == 0) {
> > -   ml_enqueue(, m0);
> > -   m0 = NULL;
> > +   if (bufs_left > 0)
> > +   continue;
> > +
> > +   /*
> > +* Unfortunately, mac filtering is only best effort
> > +* in virtio-net. Unwanted packets may still arrive.
> > +* If we are not promiscous, we have to check if the
> > +* packet is actually for us.
> > +*/
> > +   if (!promisc) {
> > +   m0 = m_pullup(m0, ETHER_HDR_LEN);
> > +   /*
> > +* no need to check m0 == NULL, we know m0 has enough
> > +* space
> > +*/
> > +
> > +   eh = mtod(m0, struct ether_header *);
> > +   if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
> > +   memcmp(enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 
> > 0) {
> > +   m_freem(m0);
> > +   m0 = NULL;
> > +   continue;
> > +   }
> > }
> > +
> > +   ml_enqueue(, m0);
> > +   m0 = NULL;
> > }
> > if (m0 != NULL) {
> > DBGPRINT("expected %d buffers, got %d", (int)hdr->num_buffers,
> > 
> > 
> 
> 



Re: Checking MAC address of incoming unicast packets

2016-01-14 Thread Stefan Fritsch
On Mon, 4 Jan 2016, Stefan Fritsch wrote:
> On Sun, 3 Jan 2016, Theo de Raadt wrote:
> > >> dlg writes:
> > >> > should we just do it unconditionally? is there a downside to that?
> > >
> > >It may decrease performance a tiny bit. Since such bits tend to add 
> > >up, I would be hesitant to enable the check for drivers that don't 
> > >need it. OTOH, freebsd and linux seem to always do the check.
> > 
> > How does it decrease performance?
> 
> I am talking about this diff in ether_input():
> 
> diff --git sys/net/if_ethersubr.c sys/net/if_ethersubr.c
> --- sys/net/if_ethersubr.c
> +++ sys/net/if_ethersubr.c
> @@ -353,8 +353,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m, void 
> *cookie)
>* If packet is unicast and we're in promiscuous mode, make sure it
>* is for us.  Drop otherwise.
>*/
> - if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 &&
> - (ifp->if_flags & IFF_PROMISC)) {
> + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
>   if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) {
>   m_freem(m);
>   return (1);
> 
> For other drivers than vio this means an additional 6 byte memcmp where 
> one operand is already in the cache, and the other operand may or may not 
> be in the cache. 'tiny performance impact' seems about right.

Hrvoje Popovski was very helpful and did some performance tests with the 
patch above. There is only measurable difference at very high package 
rates, but I must admit that there the difference is a bit larger than I 
expected:

> i'm sending 64byte packets from host connected to ix2 and counting then
> on host connected to ix3

> without your patch

sending receiving
400kpps 400kpps
600kpps 600kpps
800kpps 690kpps
1Mpps   610kpps
1.4Mpps 580kpps
12Mpps  580kpps


> with your patch

sending receiving
400kpps 400kpps -   0%
600kpps 600kpps -   0%
800kpps 680kpps - 1.4%
1Mpps   600kpps - 1.6%
1.4Mpps 560kpps - 3.4%
12Mpps  560kpps - 3.4%



So, which variant should I commit?

- do the check unconditionally
- add some flag so that ether_input() does the check for vio(4) 
unconditionally
- do the check in vio(4)

mpi fixed a similar bug in trunk(4) in July, but that does not use 
ether_input() and would not profit from the flag approach.



> > Maybe you should show the diff which does the check in the driver
> > itself, then then it will be easier to see the performance cost.  Right
> > now I can't perceive the problem.
> 
> It's not very expensive. But it's more code duplication than I like:
> 
> diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
> --- sys/dev/pci/if_vio.c
> +++ sys/dev/pci/if_vio.c
> @@ -1005,6 +1005,9 @@ vio_rxeof(struct vio_softc *sc)
>   int r = 0;
>   int slot, len, bufs_left;
>   struct virtio_net_hdr *hdr;
> + int promisc = (ifp->if_flags & IFF_PROMISC);
> + u_int8_t *enaddr = sc->sc_ac.ac_enaddr;
> + struct ether_header *eh;
>  
>   while (virtio_dequeue(vsc, vq, , ) == 0) {
>   r = 1;
> @@ -1035,10 +1038,33 @@ vio_rxeof(struct vio_softc *sc)
>   bufs_left--;
>   }
>  
> - if (bufs_left == 0) {
> - ml_enqueue(, m0);
> - m0 = NULL;
> + if (bufs_left > 0)
> + continue;
> +
> + /*
> +  * Unfortunately, mac filtering is only best effort
> +  * in virtio-net. Unwanted packets may still arrive.
> +  * If we are not promiscous, we have to check if the
> +  * packet is actually for us.
> +  */
> + if (!promisc) {
> + m0 = m_pullup(m0, ETHER_HDR_LEN);
> + /*
> +  * no need to check m0 == NULL, we know m0 has enough
> +  * space
> +  */
> +
> + eh = mtod(m0, struct ether_header *);
> + if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
> + memcmp(enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 
> 0) {
> + m_freem(m0);
> + m0 = NULL;
> + continue;
> + }
>   }
> +
> + ml_enqueue(, m0);
> + m0 = NULL;
>   }
>   if (m0 != NULL) {
>   DBGPRINT("expected %d buffers, got %d", (int)hdr->num_buffers,
> 
> 



Re: Xen virtual network (Netfront) driver

2016-01-12 Thread Stefan Fritsch
On Thu, 7 Jan 2016, Mike Belopuhov wrote:

> On 7 January 2016 at 13:17, Mark Kettenis <mark.kette...@xs4all.nl> wrote:
> >> From: Mike Belopuhov <m...@belopuhov.com>
> >> Date: Thu, 7 Jan 2016 12:02:23 +0100
> >>
> >> On 6 January 2016 at 17:58, Stefan Fritsch <s...@sfritsch.de> wrote:
> >> > On Wed, 6 Jan 2016, Mike Belopuhov wrote:
> >> >
> >> >> There's still stuff to do, but it receives and transmits reliably
> >> >> (at least on modern Xen) so I'd like to get it in.  Man page will
> >> >> follow.
> >> >
> >> > I only had a quick glance at the code, but I have one comment about your
> >> > use of memory barriers. The membar_* macros are pure compiler barriers
> >> > when the openbsd kernel is compiled for UP. But since the host machine 
> >> > and
> >> > xen may use SMP even in this case, I suspect the that you need hardware
> >> > memory barriers even if MULTIPROCESSOR is not defined. This does not seem
> >> > relevant for x86 because you don't use membar_sync, but it may become
> >> > relevant for arm, which is also supported by xen.
> >> >
> >>
> >> membar_{producer,consumer} are defined on arm to perform store and
> >> load memory barriers.  Our arm code currently does not distinguish
> >> between an MP case and non-MP case regarding the definition of these
> >> macros, so I'm not entirely certain what are you trying to say.

I didn't check arm's implementation but new that it had non-empty 
membar_{producer,consumer}. So, if it does not distinguish between an MP 
case and non-MP case, then there is no problem there. But maybe you should 
document somewhere which assumptions about the architecture you make, so 
that they can be checked when adding a new architecture. I guess arm64 
will come sooner or later and I don't know if it has exactly the same 
memory model as 32bit arm.

> >
> > Not sure ARM is a good example to look at.
> >
> 
> The only architectures that Xen dom0 is implemented for are i386,
> adm64 and arm, so there's no real need to look at anything else.
> 
> > In principle I think that the membar_xxx() interfaces could be simple
> > compiler barriers on all our architectures, at least as long as the
> > CPU will observe its own stores in the same order as they were
> > emitted.  But I think all sane CPU architectures make those
> > guarantees.  At least for "normal" memory.  However, we treat that as
> > an optimization.  And we haven't done that for all our architectures.
> >
> > The problem with virtualization is of course that even a non-MP kernel
> > is actually running in an MP environment.  If data structures are
> > shared with the hypervisor or another domain running on a different
> > CPU, proper memory barriers must be used to guarantee the other side
> > sees our stores in the right order.  The typical case would be
> > populating a descriptor with some sort of validity bit.  There you
> > want to make sure the other side doesn't see the valid bit set until
> > all the other parts of the descriptor have been filled in and are
> > visible.  In that case a simple compiler barrier may not be enough.

Yes. With intel it's the "Reads may be reordered with older writes to 
different locations but not with older writes to the same location" bit 
from the memory model that is causing problems. So you have to check if 
xen hits this case. virtio does (and removing the membarriers causes 
observable hangs).

> 
> That's what I was referring to in my example below.
> 
> > This is why the virtio_membar_xxx() primitives were introduced.
> >
> 
> Any idea why wasn't store and load barriers implemented separately?

No idea. virtio_membar_xxx() was modeled after the existing membar_xxx(). 
But AIUI membar_consumer() plus membar_producer() is not equivalent to 
membar_sync() (which also prevents read vs. write reordering).



Re: Xen virtual network (Netfront) driver

2016-01-06 Thread Stefan Fritsch
On Wed, 6 Jan 2016, Mike Belopuhov wrote:

> There's still stuff to do, but it receives and transmits reliably
> (at least on modern Xen) so I'd like to get it in.  Man page will
> follow.

I only had a quick glance at the code, but I have one comment about your 
use of memory barriers. The membar_* macros are pure compiler barriers 
when the openbsd kernel is compiled for UP. But since the host machine and 
xen may use SMP even in this case, I suspect the that you need hardware 
memory barriers even if MULTIPROCESSOR is not defined. This does not seem 
relevant for x86 because you don't use membar_sync, but it may become 
relevant for arm, which is also supported by xen.

I had the same problem in virtio and introduced the virtio_membar_* macros 
for this purpose. Maybe they should be renamed to a more generic name and 
you should use them, too?


> + if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(>if_snd))
> + return;
> +
> + prod = txr->txr_prod;
> + membar_consumer();
> +
> + for (;;) {
> + m = ifq_deq_begin(>if_snd);
> + if (m == NULL)
> + break;
> +
> + error = xnf_encap(sc, m, );
> + if (error == ENOENT) {
> + /* transient */
> + ifq_deq_rollback(>if_snd, m);
> + ifq_set_oactive(>if_snd);
> + break;
> + } else if (error) {
> + /* the chain is too large */
> + ifq_deq_commit(>if_snd, m);
> + m_freem(m);
> + continue;
> + }
> + ifq_deq_commit(>if_snd, m);
> +
> +#if NBPFILTER > 0
> + if (ifp->if_bpf)
> + bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT);
> +#endif
> + pkts++;
> + }
> + if (pkts > 0) {
> + txr->txr_prod = prod;
> + xen_intr_signal(sc->sc_xih);
> + }
> +}
> +
> +int
> +xnf_encap(struct xnf_softc *sc, struct mbuf *m, uint32_t *prod)
> +{
> + struct ifnet *ifp = >sc_ac.ac_if;
> + struct xnf_tx_ring *txr = sc->sc_tx_ring;
> + union xnf_tx_desc *txd;
> + bus_dmamap_t dmap;
> + int error, i, n = 0;
> +
> + if (((txr->txr_cons - *prod - 1) & (XNF_TX_DESC - 1)) < XNF_TX_FRAG) {
> + error = ENOENT;
> + goto errout;
> + }
> +
> + i = *prod & (XNF_TX_DESC - 1);
> + dmap = sc->sc_tx_dmap[i];
> +
> + error = bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
> + BUS_DMA_NOWAIT);
> + if (error == EFBIG) {
> + if (m_defrag(m, M_DONTWAIT) ||
> + bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
> +  BUS_DMA_NOWAIT))
> + goto errout;
> + } else if (error)
> + goto errout;
> +
> + for (n = 0; n < dmap->dm_nsegs; n++, (*prod)++) {
> + i = *prod & (XNF_TX_DESC - 1);
> + if (sc->sc_tx_buf[i])
> + panic("%s: save vs spell: %d\n", ifp->if_xname, i);
> + txd = >txr_desc[i];
> + if (n == 0) {
> + sc->sc_tx_buf[i] = m;
> + if (0 && m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
> + txd->txd_req.txq_flags = XNF_TXF_CSUM |
> + XNF_TXF_VALID;
> + txd->txd_req.txq_size = m->m_pkthdr.len;
> + } else
> + txd->txd_req.txq_size = dmap->dm_segs[n].ds_len;
> + if (n != dmap->dm_nsegs - 1)
> + txd->txd_req.txq_flags |= XNF_TXF_CHUNK;
> + txd->txd_req.txq_ref = dmap->dm_segs[n].ds_addr;
> + txd->txd_req.txq_offset = dmap->dm_segs[n].ds_offset;
> + }
> +
> + ifp->if_opackets++;
> + return (0);
> +
> + errout:
> + ifp->if_oerrors++;
> + return (error);
> +}
> +
> +void
> +xnf_intr(void *arg)
> +{
> + struct xnf_softc *sc = arg;
> + struct ifnet *ifp = >sc_ac.ac_if;
> +
> + if (ifp->if_flags & IFF_RUNNING) {
> + xnf_rxeof(sc);
> + xnf_txeof(sc);
> + }
> +}
> +
> +int
> +xnf_txeof(struct xnf_softc *sc)
> +{
> + struct ifnet *ifp = >sc_ac.ac_if;
> + struct xnf_tx_ring *txr = sc->sc_tx_ring;
> + union xnf_tx_desc *txd;
> + struct mbuf *m;
> + bus_dmamap_t dmap;
> + volatile uint32_t r;
> + uint32_t cons;
> + int i, id, pkts = 0;
> +
> + do {
> + for (cons = sc->sc_tx_cons; cons != txr->txr_cons; cons++) {
> + membar_consumer();
> + i = cons & (XNF_TX_DESC - 1);
> + txd = >txr_desc[i];
> + id = txd->txd_rsp.txp_id;
> + memset(txd, 0, sizeof(*txd));
> + txd->txd_req.txq_id = id;
> + membar_producer();
> + if (sc->sc_tx_buf[i]) {
> +  

Re: Checking MAC address of incoming unicast packets

2016-01-04 Thread Stefan Fritsch
On Sun, 3 Jan 2016, Theo de Raadt wrote:

> >On Friday 01 January 2016 16:25:59, Theo de Raadt wrote:
> >> dlg writes:
> >> > should we just do it unconditionally? is there a downside to that?
> >
> >It may decrease performance a tiny bit. Since such bits tend to add 
> >up, I would be hesitant to enable the check for drivers that don't 
> >need it. OTOH, freebsd and linux seem to always do the check.
> 
> How does it decrease performance?

I am talking about this diff in ether_input():

diff --git sys/net/if_ethersubr.c sys/net/if_ethersubr.c
--- sys/net/if_ethersubr.c
+++ sys/net/if_ethersubr.c
@@ -353,8 +353,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m, void *cookie)
 * If packet is unicast and we're in promiscuous mode, make sure it
 * is for us.  Drop otherwise.
 */
-   if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 &&
-   (ifp->if_flags & IFF_PROMISC)) {
+   if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
if (memcmp(ac->ac_enaddr, eh->ether_dhost, ETHER_ADDR_LEN)) {
m_freem(m);
return (1);

For other drivers than vio this means an additional 6 byte memcmp where 
one operand is already in the cache, and the other operand may or may not 
be in the cache. 'tiny performance impact' seems about right.

> 
> I tried to read the rest of what you wrote.  It makes no sense.  Since
> these mbufs are coming off a ring, aren't they already linear; aren't
> these mbufs already normalzed ie. "pulled up".  The performance cost
> seems to be one condition in a mbuf macro which decides to do no work.
> 
> Maybe you should show the diff which does the check in the driver
> itself, then then it will be easier to see the performance cost.  Right
> now I can't perceive the problem.

It's not very expensive. But it's more code duplication than I like:

diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
--- sys/dev/pci/if_vio.c
+++ sys/dev/pci/if_vio.c
@@ -1005,6 +1005,9 @@ vio_rxeof(struct vio_softc *sc)
int r = 0;
int slot, len, bufs_left;
struct virtio_net_hdr *hdr;
+   int promisc = (ifp->if_flags & IFF_PROMISC);
+   u_int8_t *enaddr = sc->sc_ac.ac_enaddr;
+   struct ether_header *eh;
 
while (virtio_dequeue(vsc, vq, , ) == 0) {
r = 1;
@@ -1035,10 +1038,33 @@ vio_rxeof(struct vio_softc *sc)
bufs_left--;
}
 
-   if (bufs_left == 0) {
-   ml_enqueue(, m0);
-   m0 = NULL;
+   if (bufs_left > 0)
+   continue;
+
+   /*
+* Unfortunately, mac filtering is only best effort
+* in virtio-net. Unwanted packets may still arrive.
+* If we are not promiscous, we have to check if the
+* packet is actually for us.
+*/
+   if (!promisc) {
+   m0 = m_pullup(m0, ETHER_HDR_LEN);
+   /*
+* no need to check m0 == NULL, we know m0 has enough
+* space
+*/
+
+   eh = mtod(m0, struct ether_header *);
+   if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
+   memcmp(enaddr, eh->ether_dhost, ETHER_ADDR_LEN) != 
0) {
+   m_freem(m0);
+   m0 = NULL;
+   continue;
+   }
}
+
+   ml_enqueue(, m0);
+   m0 = NULL;
}
if (m0 != NULL) {
DBGPRINT("expected %d buffers, got %d", (int)hdr->num_buffers,



Re: Checking MAC address of incoming unicast packets

2016-01-04 Thread Stefan Fritsch
On Mon, 4 Jan 2016, Claudio Jeker wrote:

> On Sat, Jan 02, 2016 at 04:04:33PM +0100, Mark Kettenis wrote:
> > > Date: Sat, 2 Jan 2016 10:57:41 +0100
> > > From: Martin Pieuchot 
> > > 
> > > If it's acceptable performance-wise to do the check unconditionally I
> > > believe that's the way to go.  If not I'm a bit afraid of introducing
> > > a flag/capability for a single driver.  Do you know if any other driver
> > > could use it?
> > 
> > I suppose any paravirtualized network driver would need this; i.e. my
> > sun4v vnet(4) driver and the xen stuff mikeb@ is working on.
> > 

That really depends on the device. Not all need to have the same weird 
semantics as virtio. But I wouldn't rule it out, either.

> Shouldn't all these interfaces have the PROMISC flag set all the time
> instead? They seem to be always promiscous since they don't have a HW mac
> address filter. Doing that would solve the problem in a simple way
> (special ioctl handling per driver).

No, it's not the same as always promiscous. The virtio specification says 
"there is a filter, but unwanted packets may still arrive". The 
reliability of the HW mac address filter depends on the implementation. 
Some have a reliable filter (e.g. the one in qemu), some rely on the 
software bridge (e.g. the vhost-net implementation in the linux kernel). 
In order to reliably get all packets, the driver has to explicitly tell 
the device to switch PROMISC mode on.



Re: Checking MAC address of incoming unicast packets

2016-01-02 Thread Stefan Fritsch
On Friday 01 January 2016 16:25:59, Theo de Raadt wrote:
> dlg writes:
> > should we just do it unconditionally? is there a downside to that?

It may decrease performance a tiny bit. Since such bits tend to add 
up, I would be hesitant to enable the check for drivers that don't 
need it. OTOH, freebsd and linux seem to always do the check.

> 
> That is a very good question.  What are the downsides against having
> the driver do this filtering itself, like all real hardware does?

The driver would need to do some mbuf operations to get the 
destination address. The same operations are then done again inside 
ether_input(). This is ugly.


> Why risk sending packets of the wrong form further upwards into the
> network stack, and then having to ensure the checks exist up there?
> 
> This flag is requesting special service to encourage a layer
> violation.
> 
> What is the reasoning to encourage such a flag (which someone will
> one day see, misuse, and try to expand for their bizzare use
> case...)

To me, it seems as much a layer violation as hardware checksum 
offloading features or the IFF_SIMPLEX flag. So no change there.


On Saturday 02 January 2016 10:57:41, Martin Pieuchot wrote:
> If it's acceptable performance-wise to do the check unconditionally
> I believe that's the way to go.  If not I'm a bit afraid of
> introducing a flag/capability for a single driver.  Do you know if
> any other driver could use it?

I haven't found specs of xen or vmware network drivers, therefore I 
can't say if those would have the same issue.

One point for doing the check unconditionally is that it may guard 
against buggy hardware or buggy hypervisors (in cases the spec does 
not allow this). Since the observable error in our case was that 
sometimes TCP connections of other, unrelated VM hosts were aborted 
with TCP RSTs, such bugs are very difficult to find if they are not 
triggered often.

I don't have a setup myself where I could do meaningful performance 
tests. If anyone has a setup where he can easily do such tests, it 
would be nice if he could test if removing the "(ifp->if_flags & 
IFF_PROMISC)" check in ether_input() makes any difference. Otherwise, 
I can try to do some performance tests at work, but that may take a 
while.



Checking MAC address of incoming unicast packets

2016-01-01 Thread Stefan Fritsch
Hi,

by default, the ether_input() checks the destination MAC address of 
incoming unicast packets only if the interface is in promiscous mode. If 
not, it is assumed that the NIC filters unicast packets reliably. 
Unfortunately, for virtio-net this is not the case. There, unicast 
filtering is only best effort, and (depending on configuration) if the 
bridge on the VM host does unicast flodding, unicast packets that are not 
for the VM guest may still be delivered to the VM guest. This is a rather 
annoying problem because it can cause pf to send RST packets to foreign 
connections. (Kudos to mpf@ for debugging this).

There are two possible approaches to fix this problem. Either make the 
vio(4) driver filter out unicast packets that are not for the local MAC, 
which would involve duplicating quite a bit of code from ether_input() in 
vio(4). Or, and I would prefer this, allow the driver to tell 
ether_input() that it needs to check the MAC always, and not only if the 
interface is in promiscous mode.

This could be done with a new flag. There seem to be three possible places 
where this flag could be put:

* ifnet.if_flags
  This is a short and there is no free bit. But the IFF_NOTRAILERS bit has
  become unused recently and could be recycled.

* ifnet.if_xflags
  An int, lots of free bits. But comment says 'extra softnet flags'

* if_data.ifi_capabilities
  An u_int32_t, lots of free bits. 


In the diff below, I went with the first choice because the new IFF_NOMACFILTER
is somewhat similar to IFF_SIMPLEX and because the the check can then be 
nicely folded into the existing check for IFF_PROMISC.

I would welcome any comments, suggestions for a better flag name, OKs, ...

Cheers,
Stefan



diff --git sys/dev/pci/if_vio.c sys/dev/pci/if_vio.c
index 4cd80d5..22fd7cf 100644
--- sys/dev/pci/if_vio.c
+++ sys/dev/pci/if_vio.c
@@ -582,21 +582,21 @@ vio_attach(struct device *parent, struct device *self, 
void *aux)
virtio_start_vq_intr(vsc, >sc_vq[VQCTL]);
vsc->sc_nvqs = 3;
}
}
 
if (vio_alloc_mem(sc) < 0)
goto err;
 
strlcpy(ifp->if_xname, self->dv_xname, IFNAMSIZ);
ifp->if_softc = sc;
-   ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+   ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | 
IFF_NOMACFILTER;
ifp->if_start = vio_start;
ifp->if_ioctl = vio_ioctl;
ifp->if_capabilities = IFCAP_VLAN_MTU;
if (features & VIRTIO_NET_F_CSUM)
ifp->if_capabilities |= IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4;
IFQ_SET_MAXLEN(>if_snd, vsc->sc_vqs[1].vq_num - 1);
IFQ_SET_READY(>if_snd);
ifmedia_init(>sc_media, 0, vio_media_change, vio_media_status);
ifmedia_add(>sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
ifmedia_set(>sc_media, IFM_ETHER | IFM_AUTO);
diff --git sys/net/if.h sys/net/if.h
index 8d7e390..91c6d18 100644
--- sys/net/if.h
+++ sys/net/if.h
@@ -182,36 +182,37 @@ struct if_status_description {
 /*
  * Length of interface description, including terminating '\0'.
  */
 #defineIFDESCRSIZE 64
 
 #defineIFF_UP  0x1 /* interface is up */
 #defineIFF_BROADCAST   0x2 /* broadcast address valid */
 #defineIFF_DEBUG   0x4 /* turn on debugging */
 #defineIFF_LOOPBACK0x8 /* is a loopback net */
 #defineIFF_POINTOPOINT 0x10/* interface is point-to-point 
link */
-#defineIFF_NOTRAILERS  0x20/* avoid use of trailers */
+#defineIFF_NOMACFILTER 0x20/* Does not reliably filter 
unicast MACs */
 #defineIFF_RUNNING 0x40/* resources allocated */
 #defineIFF_NOARP   0x80/* no address resolution 
protocol */
 #defineIFF_PROMISC 0x100   /* receive all packets */
 #defineIFF_ALLMULTI0x200   /* receive all multicast 
packets */
 #defineIFF_OACTIVE 0x400   /* transmission in progress */
 #defineIFF_SIMPLEX 0x800   /* can't hear own transmissions 
*/
 #defineIFF_LINK0   0x1000  /* per link layer defined bit */
 #defineIFF_LINK1   0x2000  /* per link layer defined bit */
 #defineIFF_LINK2   0x4000  /* per link layer defined bit */
 #defineIFF_MULTICAST   0x8000  /* supports multicast */
 
+
 /* flags set internally only: */
 #defineIFF_CANTCHANGE \
(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\
-   IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI)
+   IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_NOMACFILTER)
 
 #define IFXF_MPSAFE0x1 /* if_start is mpsafe */
 #defineIFXF_INET6_NOPRIVACY0x4 /* don't autoconf 
privacy */
 #defineIFXF_MPLS   0x8 /* 

Re: pppd and ioctl(TIOCSETD) (was: CVS: cvs.openbsd.org: src)

2015-12-22 Thread Stefan Fritsch
On Tue, 22 Dec 2015, David Coppa wrote:
> I suspect this broke my 3G connection:

Oops. Sorry. Try this diff (already committed):

--- sys/kern/tty_conf.c
+++ sys/kern/tty_conf.c
@@ -42,6 +42,11 @@
 #include 
 #include 
 
+#include "ppp.h"
+#include "nmea.h"
+#include "msts.h"
+#include "endrun.h"
+
 #definettynodisc ((int (*)(dev_t, struct tty *, struct proc *))enodev)
 #definettyerrclose ((int (*)(struct tty *, int flags, struct proc 
*))enodev)
 #definettyerrio ((int (*)(struct tty *, struct uio *, int))enodev)



Re: Looking for testers for amd64 suspend diff

2015-09-11 Thread Stefan Fritsch
On Sunday 06 September 2015 11:44:11, Stefan Fritsch wrote:
> the diff below is necessary to make suspend/resume work when x2apic
> is enabled (i.e. on qemu/kvm/...). While I don't expect problems,
> it would be nice if I could get some reports that this doesn't
> break suspend/resume on real machines, especially older ones (like
> core 2 duo and older) and ones with an amd cpu.

Committed now. Thanks again to everyone who tested it.


Re: Looking for testers for amd64 suspend diff

2015-09-07 Thread Stefan Fritsch
On Monday 07 September 2015 13:23:03, Alexander Bluhm wrote:
> Suspend OpenBSD in a fully emulated qemu running on OpenBSD works.
> After resume, typing in the shell of the virtualized machine works.
> But at disk access, all processes hang in biowait.
> 
> It is the same behavior with and without your diff.

If you use virtio disk this is expected, because the virtio drivers 
still lack the necessary suspend/resume logic.


Looking for testers for amd64 suspend diff

2015-09-06 Thread Stefan Fritsch
Hi,

the diff below is necessary to make suspend/resume work when x2apic is
enabled (i.e. on qemu/kvm/...). While I don't expect problems, it would be 
nice if I could get some reports that this doesn't break suspend/resume on 
real machines, especially older ones (like core 2 duo and older) and ones 
with an amd cpu.

Thanks in advance.

Cheers,
Stefan

--- a/sys/arch/amd64/amd64/acpi_wakecode.S
+++ b/sys/arch/amd64/amd64/acpi_wakecode.S
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include "lapic.h"
 
 #define _ACPI_TRMP_LABEL(a) a = . - _C_LABEL(acpi_real_mode_resume) + \
ACPI_TRAMPOLINE
@@ -245,6 +246,13 @@ _C_LABEL(acpi_long_mode_resume):
movw%ax, %gs
 
/* Restore registers - start with the MSRs */
+#if NLAPIC > 0
+   movl$MSR_APICBASE, %ecx
+   movlacpi_saved_apicbase, %eax
+   movlacpi_saved_apicbase+4, %edx
+   wrmsr
+#endif
+
movl$MSR_STAR, %ecx
movlacpi_saved_star, %eax
movlacpi_saved_star+4, %edx
@@ -623,6 +631,10 @@ _ACPI_TRMP_DATA_LABEL(acpi_saved_cstar)
.quad 0
 _ACPI_TRMP_DATA_LABEL(acpi_saved_sfmask)
.quad 0
+#if NLAPIC > 0
+_ACPI_TRMP_DATA_LABEL(acpi_saved_apicbase)
+   .quad 0
+#endif
 
.align 4
 _ACPI_TRMP_DATA_LABEL(acpi_pdirpa)
@@ -682,6 +694,13 @@ NENTRY(acpi_savecpu)

pushq   %rcx
pushq   %rdx
+#if NLAPIC > 0
+   movl$MSR_APICBASE, %ecx
+   rdmsr
+   movl%eax, acpi_saved_apicbase
+   movl%edx, acpi_saved_apicbase+4
+#endif
+
movl$MSR_STAR, %ecx
rdmsr
movl%eax, acpi_saved_star



Re: Possible fix for i217 problem

2015-08-05 Thread Stefan Fritsch
Thanks to everyone for the testing. The patch is now committed.



Possible fix for i217 problem

2015-08-04 Thread Stefan Fritsch
Hi,

someone mentioned to me the i217-LM problems that were reported on misc 
end of May. It is possible that the patch below helps.

For us, it fixed a problem on a laptop with i217-LM (pci id 8086:153a) 
where the receiving of packets would stop until the battery of the laptop 
was removed (or until linux or freebsd were booted, which also have this 
workaround). A normal reboot or power-cycle without removing the battery 
did not help. Interestingly, not even the Intel PXE BIOS has the 
workaround.

The problem would happen if the LAN cable was plugged in after the card 
had already been initialized. If the LAN cable was always plugged in when 
the laptop was powered on, the problem would not appear.

The workaround is part of the e1000_lv_jumbo_workaround_ich8lan() function 
in e1000_ich8lan.c in freebsd, but only the part that is used if jumbo 
packets are *not* configured. Linux has the same fix as 
b20a774495671f037e7160ea2ce87 and da1e2046e5f5ab268e55d30d6b74099ade0aeb6f 
with some more info in the commit messages.

This probably has quite some potential to cause regressions with other 
boards, so i am not sure if it should go in before 5.8 release.

Cheers,
Stefan


--- a/sys/dev/pci/if_em_hw.c
+++ b/sys/dev/pci/if_em_hw.c
@@ -91,6 +91,7 @@ static int32_tem_id_led_init(struct em_hw *);
 static int32_t em_init_lcd_from_nvm_config_region(struct em_hw *,  uint32_t,
uint32_t);
 static int32_t em_init_lcd_from_nvm(struct em_hw *);
+static int32_t em_phy_no_cable_workaround(struct em_hw *);
 static voidem_init_rx_addrs(struct em_hw *);
 static voidem_initialize_hardware_bits(struct em_hw *);
 static boolean_t em_is_onboard_nvm_eeprom(struct em_hw *);
@@ -7018,6 +7019,96 @@ em_read_mac_addr(struct em_hw *hw)
 }
 
 /**
+ * Explicitly disables jumbo frames and resets some PHY registers back to hw-
+ * defaults. This is necessary in case the ethernet cable was inserted AFTER
+ * the firmware initialized the PHY. Otherwise it is left in a state where
+ * it is possible to transmit but not receive packets. Observed on I217-LM and
+ * fixed in FreeBSD's sys/dev/e1000/e1000_ich8lan.c.
+ *
+ * hw - Struct containing variables accessed by shared code
+ */
+STATIC int32_t
+em_phy_no_cable_workaround(struct em_hw *hw) {
+   int32_t ret_val, dft_ret_val;
+   uint32_t mac_reg;
+   uint16_t data, phy_reg;
+
+   /* disable Rx path while enabling workaround */
+   em_read_phy_reg(hw, I2_DFT_CTRL, phy_reg);
+   ret_val = em_write_phy_reg(hw, I2_DFT_CTRL, phy_reg | (1  14));
+   if (ret_val)
+   return ret_val;
+
+   /* Write MAC register values back to h/w defaults */
+   mac_reg = E1000_READ_REG(hw, FFLT_DBG);
+   mac_reg = ~(0xF  14);
+   E1000_WRITE_REG(hw, FFLT_DBG, mac_reg);
+
+   mac_reg = E1000_READ_REG(hw, RCTL);
+   mac_reg = ~E1000_RCTL_SECRC;
+   E1000_WRITE_REG(hw, RCTL, mac_reg);
+
+   ret_val = em_read_kmrn_reg(hw, E1000_KUMCTRLSTA_OFFSET_CTRL, data);
+   if (ret_val)
+   goto out;
+   ret_val = em_write_kmrn_reg(hw, E1000_KUMCTRLSTA_OFFSET_CTRL,
+   data  ~(1  0));
+   if (ret_val)
+   goto out;
+
+   ret_val = em_read_kmrn_reg(hw, E1000_KUMCTRLSTA_OFFSET_HD_CTRL, data);
+   if (ret_val)
+   goto out;
+
+   data = ~(0xF  8);
+   data |= (0xB  8);
+   ret_val = em_write_kmrn_reg(hw, E1000_KUMCTRLSTA_OFFSET_HD_CTRL, data);
+   if (ret_val)
+   goto out;
+
+   /* Write PHY register values back to h/w defaults */
+   em_read_phy_reg(hw, I2_SMBUS_CTRL, data);
+   data = ~(0x7F  5);
+   ret_val = em_write_phy_reg(hw, I2_SMBUS_CTRL, data);
+   if (ret_val)
+   goto out;
+
+   em_read_phy_reg(hw, I2_MODE_CTRL, data);
+   data |= (1  13);
+   ret_val = em_write_phy_reg(hw, I2_MODE_CTRL, data);
+   if (ret_val)
+   goto out;
+
+   /*
+* 776.20 and 776.23 are not documented in
+* i217-ethernet-controller-datasheet.pdf...
+*/
+   em_read_phy_reg(hw, PHY_REG(776, 20), data);
+   data = ~(0x3FF  2);
+   data |= (0x8  2);
+   ret_val = em_write_phy_reg(hw, PHY_REG(776, 20), data);
+   if (ret_val)
+   goto out;
+
+   ret_val = em_write_phy_reg(hw, PHY_REG(776, 23), 0x7E00);
+   if (ret_val)
+   goto out;
+
+   em_read_phy_reg(hw, I2_PCIE_POWER_CTRL, data);
+   ret_val = em_write_phy_reg(hw, I2_PCIE_POWER_CTRL, data  ~(1  10));
+   if (ret_val)
+   goto out;
+
+out:
+   /* re-enable Rx path after enabling workaround */
+   dft_ret_val = em_write_phy_reg(hw, I2_DFT_CTRL, phy_reg  ~(1  14));
+   if (ret_val)
+   return ret_val;
+   else
+   return dft_ret_val;
+}
+

Re: usb hang related to xhci

2015-07-19 Thread Stefan Fritsch
On Sat, 18 Jul 2015, David Hill wrote:
 Whenever I plug a device into my USB ports, my machine locks hard.  I
 have the Intel Series 7 / C216 chip, so xhci attempts to route the port
 from ehci to xhci. 
 
 The following diff is from FreeBSD and makes my USB devices work again.
 https://github.com/freebsd/freebsd/blob/e79c62ff68fc74d88cb6f479859f6fae9baa5101/sys/dev/usb/controller/xhci_pci.c#L153-L176

I didn't have the lockup but my Intel Series 9 laptop does not recognice a 
USB 3.0 storage device if I insert it after a hot boot. If I insert it 
after a cold boot, it works ok.

This behavior does not change with your patch.

Cheers,
Stefan


 
 
 Index: sys/dev/pci/xhci_pci.c
 ===
 RCS file: /cvs/src/sys/dev/pci/xhci_pci.c,v
 retrieving revision 1.6
 diff -u -p -r1.6 xhci_pci.c
 --- sys/dev/pci/xhci_pci.c22 Jun 2015 08:43:27 -  1.6
 +++ sys/dev/pci/xhci_pci.c19 Jul 2015 02:20:06 -
 @@ -92,33 +92,45 @@ xhci_pci_match(struct device *parent, vo
  static int
  xhci_pci_port_route(struct xhci_pci_softc *psc)
  {
 - pcireg_t val;
 + pcireg_t val, usb2_mask, usb3_mask;
  
 - /*
 -  * Check USB3 Port Routing Mask register that indicates the ports
 -  * can be changed from OS, and turn on by USB3 Port SS Enable register.
 -  */
 - val = pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_USB3PRM);
 - DPRINTF((%s: USB3PRM / USB3.0 configurable ports: 0x%08x\n,
 - psc-sc.sc_bus.bdev.dv_xname, val));
 +/*
 + * Check USB3 Port Routing Mask register that indicates the ports
 + * can be changed from OS, and turn on by USB3 Port SS Enable 
 register.
 + */
 +usb3_mask = pci_conf_read(psc-sc_pc, psc-sc_tag,
 + PCI_XHCI_INTEL_USB3PRM);
 +DPRINTF((%s: USB3PRM / USB3.0 configurable ports: 0x%08x\n,
 +psc-sc.sc_bus.bdev.dv_xname, usb3_mask));
 +
 +/*
 + * Check USB2 Port Routing Mask register that indicates the USB2.0
 + * ports to be controlled by xHCI HC, and switch them to xHCI HC.
 + */
 +usb2_mask = pci_conf_read(psc-sc_pc, psc-sc_tag,
 + PCI_XHCI_INTEL_XUSB2PRM);
 +DPRINTF((%s: XUSB2PRM / USB2.0 ports can switch from EHCI to xHCI:
 +0x%08x\n, psc-sc.sc_bus.bdev.dv_xname, val));
 +
 + val = pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_USB3_PSSEN) 
 |
 + pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_XUSB2PR);
  
 - pci_conf_write(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_USB3_PSSEN, val);
 +
 + pci_conf_write(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_USB3_PSSEN,
 + val  usb3_mask);
 +#ifdef XHCI_DEBUG
   val = pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_USB3_PSSEN);
   DPRINTF((%s: USB3_PSSEN / Enabled USB3.0 ports under xHCI: 0x%08x\n,
   psc-sc.sc_bus.bdev.dv_xname, val));
 +#endif
  
 - /*
 -  * Check USB2 Port Routing Mask register that indicates the USB2.0
 -  * ports to be controlled by xHCI HC, and switch them to xHCI HC.
 -  */
 - val = pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_XUSB2PRM);
 - DPRINTF((%s: XUSB2PRM / USB2.0 ports can switch from EHCI to xHCI:
 - 0x%08x\n, psc-sc.sc_bus.bdev.dv_xname, val));
 -
 - pci_conf_write(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_XUSB2PR, val);
 + pci_conf_write(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_XUSB2PR,
 + val  usb2_mask);
 +#ifdef XHCI_DEBUG
   val = pci_conf_read(psc-sc_pc, psc-sc_tag, PCI_XHCI_INTEL_XUSB2PR);
   DPRINTF((%s: XUSB2PR / USB2.0 ports under xHCI: 0x%08x\n,
   psc-sc.sc_bus.bdev.dv_xname, val));
 +#endif
  
   return (0);
  }
 
 



Call for testing for tty diff

2015-07-17 Thread Stefan Fritsch
While writing a virtio-console driver, I have found a bug in ttwrite() 
that can cause hangs. Below is a fix and after talking to Theo, I would 
like to know if the patch causes regressions for anyone, for example hangs 
in pty input/output, serial ports, etc. Thanks in advance.

Cheers,
Stefan


- Introduce new defines TTHIWATMINSPACE, TTMINHIWAT for some magic values
  that are used in tty.c.
- Remove hiwat adjustments in ttwrite(). This fixes the missing spltty().
- The above alone causs deadlocks with ptys. Change ttysetwater() to keep 
  at least TTHIWATMINSPACE space above the high water mark. This makes it
  consistent with ttycheckoutq() and seems to fix the pty deadlocks.


--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -1688,7 +1688,7 @@ ttycheckoutq(struct tty *tp, int wait)
hiwat = tp-t_hiwat;
s = spltty();
oldsig = wait ? curproc-p_siglist : 0;
-   if (tp-t_outq.c_cc  hiwat + 200)
+   if (tp-t_outq.c_cc  hiwat + TTHIWATMINSPACE)
while (tp-t_outq.c_cc  hiwat) {
ttstart(tp);
if (wait == 0 || curproc-p_siglist != oldsig) {
@@ -1823,7 +1823,7 @@ loop:
tp-t_rocount = 0;
if (ttyoutput(*cp, tp) = 0) {
/* out of space */
-   goto overfull;
+   goto ovhiwat;
}
cp++;
cc--;
@@ -1849,7 +1849,7 @@ loop:
tp-t_outcc += ce;
if (i  0) {
/* out of space */
-   goto overfull;
+   goto ovhiwat;
}
if (ISSET(tp-t_lflag, FLUSHO) ||
tp-t_outq.c_cc  hiwat)
@@ -1869,15 +1869,6 @@ done:
explicit_bzero(obuf, obufcc);
return (error);
 
-overfull:
-   /*
-* Since we are using ring buffers, if we can't insert any more into
-* the output queue, we can assume the ring is full and that someone
-* forgot to set the high water mark correctly.  We set it and then
-* proceed as normal.
-*/
-   hiwat = tp-t_outq.c_cc - 1;
-
 ovhiwat:
ttstart(tp);
s = spltty();
@@ -2114,7 +2105,7 @@ ttsetwater(struct tty *tp)
cps = tp-t_ospeed / 10;
tp-t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
x += cps;
-   tp-t_hiwat = CLAMP(x, tp-t_outq.c_cn, 100);
+   tp-t_hiwat = CLAMP(x, tp-t_outq.c_cn - TTHIWATMINSPACE, TTMINHIWAT);
 #undef CLAMP
 }
 
--- sys/sys/tty.h
+++ sys/sys/tty.h
@@ -171,6 +171,8 @@ struct itty {
 #ifdef _KERNEL
 #defineTTMAXLOWAT  256
 #defineTTMINLOWAT  32
+#defineTTMINHIWAT  100
+#defineTTHIWATMINSPACE 200 /* Min space above hiwat */
 #endif
 
 /* These flags are kept in t_state. */



Re: add m_defrag to pcn driver

2015-04-08 Thread Stefan Fritsch
On Wednesday 08 April 2015 09:48:14, David Gwynne wrote:
  +continue;
  +}
 
  
 
  Other drivers don't dequeue mbufs in out of mem situations. I
  think you  should just set IFF_OACTIVE and break out of the for
  loop (not shown in the diff). The old code just does the break,
  but I think setting IFF_OACTIVE would be correct here.
 
 OACTIVE means the tx ring is full. it (generally) gets cleared by a
 driver when the tx ring is emptied.
 
 if the tx ring is empty but we set it when the system runs out of
 resources (ie, memory shortages, dma mappings, iommu slot
 exhaustion, solar flares, etc), then there's no event to clear
 OACTIVE with. if we cant map the mbuf we should drop it and move
 on.

Sounds reasonable but many drivers (e.g. em, ix, bge) don't do that  
but set OACTIVE in this case. Is there a reason for that? From a 
casual glance at the source it is not clear to me that they have some 
other way to recover. Some have a watchdog that will reset the device 
after some time.



Re: add m_defrag to pcn driver

2015-04-07 Thread Stefan Fritsch
Hi,

I have committed the vio diff with some tweaks to remove some more now 
unused code. But I think this diff is actually incorrect:

On Tue, 31 Mar 2015, Kimberley Manning wrote:
 Index: if_pcn.c
 ===
 RCS file: /cvs/src/sys/dev/pci/if_pcn.c,v
 retrieving revision 1.36
 diff -u -p -r1.36 if_pcn.c
 --- if_pcn.c  14 Mar 2015 03:38:48 -  1.36
 +++ if_pcn.c  27 Mar 2015 12:17:24 -
 @@ -851,25 +851,23 @@ pcn_start(struct ifnet *ifp)
* were short on resources.  In this case, we'll copy
* and try again.
*/
 - if (bus_dmamap_load_mbuf(sc-sc_dmat, dmamap, m0,
 - BUS_DMA_WRITE|BUS_DMA_NOWAIT) != 0) {
 - MGETHDR(m, M_DONTWAIT, MT_DATA);
 - if (m == NULL)
 - break;
 - if (m0-m_pkthdr.len  MHLEN) {
 - MCLGET(m, M_DONTWAIT);
 - if ((m-m_flags  M_EXT) == 0) {
 - m_freem(m);
 - break;
 - }
 - }
 - m_copydata(m0, 0, m0-m_pkthdr.len, mtod(m, caddr_t));
 - m-m_pkthdr.len = m-m_len = m0-m_pkthdr.len;
 - error = bus_dmamap_load_mbuf(sc-sc_dmat, dmamap,
 - m, BUS_DMA_WRITE|BUS_DMA_NOWAIT);
 - if (error)
 - break;
 - }
 +error = bus_dmamap_load_mbuf(sc-sc_dmat, dmamap, m0,
 +BUS_DMA_WRITE|BUS_DMA_NOWAIT);
 +switch (error) {
 +case 0:
 +break; 
 +case EFBIG:
 +if ((error = m_defrag(m0, M_DONTWAIT)) == 0 
 +(error = bus_dmamap_load_mbuf(sc-sc_dmat, 
 dmamap,
 + m0, BUS_DMA_WRITE|BUS_DMA_NOWAIT)) == 0)
 +break;
 +
 +/* FALLTHROUGH */
 +default:
 + IFQ_DEQUEUE(ifp-if_snd, m0);
 + m_freem(m);

m is always NULL. Therefore you leak m0 in the error path. You should 
remove the m variable which is no longer necessary, and possibly rename m0 
to m. But the rename should probably be done in a second diff for easier 
review.

 + continue;
 +}

Other drivers don't dequeue mbufs in out of mem situations. I think you 
should just set IFF_OACTIVE and break out of the for loop (not shown in 
the diff). The old code just does the break, but I think setting 
IFF_OACTIVE would be correct here.


  
   /*
* Ensure we have enough descriptors free to describe
 
 



Re: add m_defrag to pcn driver

2015-04-01 Thread Stefan Fritsch
On Wednesday 01 April 2015 12:46:47, Kimberley Manning wrote:
   This diff makes the pcn driver use m_defrag for fragmented mbuf
   chains,
  
 
  I like this kind of cleanups.  As for vio(4) could you try the
  diff or are you looking for testers?
 
 Thanks. I've tried both the vio and pcn diffs and they worked for
 me, so dlg told me to write here. I'm not sure what happens to
 diffs after this point to be honest.

I can take a look at the vio diff and commit it in a few days.

BTW, is there a way to trigger heavily fragmented mbufs in order to 
test the code path?



Re: Binary code patching and paravirtualization

2014-12-16 Thread Stefan Fritsch
On Mon, 15 Dec 2014, Ted Unangst wrote:

 On Mon, Dec 15, 2014 at 23:55, Stefan Fritsch wrote:
  
  Only in order to get a flags field that can be tweaked with config(8). And
  to allow disable via config(8), though that could also be achieved with a
  flag.
  
  Tweaking the behavior with a flags value is necessary because hypervisors
  not always announce what they are capable of. One reason for this is that
  qemu is responsible for setting most of the cpuid flags but the kvm kernel
  module offers some interfaces in all cases. Another reason seems to be
  simple oversight, for example Illuminos KVM forgot to add the cpuid stuff
  from Linux KVM. And if there is some bug it may be a good idea to have a
  simple way to check if it is related to the paravirt stuff.
  
  I have just noticed that there is support in config(8) to set non-devices
  related int values. But this does not seem to be supported in the
  in-kernel UKC. And I can't see right now how config finds the valid int
  values in the kernel. Or is it just necessary to add the name of the
  variable prepended by an underscore to the config tool? Would this be
  preferred over introducing the paravirt device? But being able to set this
  from the in-kernel UKC would be nice, too.

What I have forgotten is that just because a hypervisor supports a 
specific interface, it does not always mean that it is a good idea to use 
it. For example, recent Intel CPUs with a recent Linux/KVM have better 
hardware support for APIC virtualization and using the Hyper-V APIC access 
MSRs actually slow things down.

 I think it would be better to avoid fake device proliferation, but
 others may have other opinions.
 
 So the problem is that some hypervisors are broken and don't
 identify as such? Perhaps we ignore them to start? Then we can add a
 mechanism to force paravirt code patching.

 I think the introduction of paravirt code patching and the mechanism
 used to enable or disable are separate issues. It can start as a
 normal option PARAVIRT, and then we discuss what else needs to be done?

Yes, we can first start with some things that are unproblematic 
WRT detection. AFAIK, this is the case for the PV EOI optimization.


Unfortunately, the bit that gives the largest performance boost at the 
moment, the APIC access MSRs, is not so easy:
- KVM has supported it for a long time
- but it is only announced if qemu is started with a special command line 
option
- Illumos KVM supports it too, but there is no way to announce it

Also, old hypervisors tend to stay around for a long time and users can't 
always influence the configuration when they rent a vserver.

On the other hand, the same positive effect as the APIC access MSRs should 
also be achievable by adding x2apic support. kettenis wanted to check how 
much effort that would be. So, I think it would be best to wait a bit with 
this piece.



Re: Binary code patching and paravirtualization

2014-12-15 Thread Stefan Fritsch
On Thu, 11 Dec 2014, Mark Kettenis wrote:

  From: Alexey Suslikov alexey.susli...@gmail.com
  Date: Thu, 11 Dec 2014 20:51:14 + (UTC)
  
  Stefan Fritsch sf at sfritsch.de writes:
  
   --- a/sys/arch/amd64/include/specialreg.h
   +++ b/sys/arch/amd64/include/specialreg.h
at  at  -158,6 +158,7  at  at 
#define  CPUIDECX_AVX0x1000  /* Advanced Vector Extensions 
  */
#define  CPUIDECX_F16C   0x2000  /* 16bit fp conversion  */
#define  CPUIDECX_RDRAND 0x4000  /* RDRAND instruction  */
   +#define  CPUIDECX_HYPERV 0x8000  /* Hypervisor present */
  
  Is this flag standardized? Last time I have tried to push this, there
  was an objection based on reserved for future use status of this flag.
  
  See http://marc.info/?l=openbsd-bugsm=136907278229145w=2
 
 Well, that thread started out with a questionable workaround for a
 hypervisor bug.  That may have have influenced the debate about the
 flag a bit.
 
 You can be almost certain that Intel and AMD will not use that
 reserved bit for anything else.  The Linux KVM virtualization business
 is too important for them.  And if Microsoft Hyper-V or VMWare ESX
 sets that bit as well, this becomes an absolute certainty.

The intel manual says Not Used, Always returns 0 which is different from 
reserved, which is stated for other bits.

FTR, jasper@ checked that vmware sets the bit while virtual box does not. 
So, many but not all hypervisors set it.

 I prefer the CPUIDECX_HV name used in the diff you posted in:

OK?

diff --git a/sys/arch/amd64/amd64/identcpu.c b/sys/arch/amd64/amd64/identcpu.c
--- a/sys/arch/amd64/amd64/identcpu.c
+++ b/sys/arch/amd64/amd64/identcpu.c
@@ -129,6 +129,7 @@ const struct {
{ CPUIDECX_AVX, AVX },
{ CPUIDECX_F16C,F16C },
{ CPUIDECX_RDRAND,  RDRAND },
+   { CPUIDECX_HV,  HV },
 }, cpu_ecpuid_ecxfeatures[] = {
{ CPUIDECX_LAHF,LAHF },
{ CPUIDECX_CMPLEG,  CMPLEG },
diff --git a/sys/arch/amd64/include/specialreg.h 
b/sys/arch/amd64/include/specialreg.h
--- a/sys/arch/amd64/include/specialreg.h
+++ b/sys/arch/amd64/include/specialreg.h
@@ -158,6 +158,7 @@
 #defineCPUIDECX_AVX0x1000  /* Advanced Vector Extensions */
 #defineCPUIDECX_F16C   0x2000  /* 16bit fp conversion  */
 #defineCPUIDECX_RDRAND 0x4000  /* RDRAND instruction  */
+#defineCPUIDECX_HV 0x8000  /* Running on hypervisor */
 
 /*
  * Structured Extended Feature Flags Parameters (CPUID function 0x7, leaf 0)



Re: Binary code patching and paravirtualization

2014-12-15 Thread Stefan Fritsch
On Mon, 15 Dec 2014, Ted Unangst wrote:

 On Wed, Dec 10, 2014 at 00:32, Stefan Fritsch wrote:
 
  --- a/sys/arch/amd64/conf/GENERIC
  +++ b/sys/arch/amd64/conf/GENERIC
  @@ -39,6 +39,8 @@ isa0  at amdpcib?
  isa0at tcpcib?
  pci*at mainbus0
  
  +paravirt0 at mainbus0
  +
  acpi0   at bios0
  acpitimer*  at acpi?
  acpihpet*   at acpi?
  diff --git a/sys/arch/amd64/conf/files.amd64
  b/sys/arch/amd64/conf/files.amd64
  index 103b653..a73d5d8 100644
  --- a/sys/arch/amd64/conf/files.amd64
  +++ b/sys/arch/amd64/conf/files.amd64
  @@ -80,6 +80,12 @@ device   mainbus: isabus, pcibus, mainbus
  attach  mainbus at root
  filearch/amd64/amd64/mainbus.c  mainbus
  
  +device paravirt
  +attach paravirt at mainbus
  +file   arch/amd64/amd64/paravirt.c paravirt needs-flag
  +
  +file   arch/amd64/amd64/codepatch.c
 
 Can you explain why this needs to be a device? It doesn't appear to be
 doing any device like things.
 

Only in order to get a flags field that can be tweaked with config(8). And 
to allow disable via config(8), though that could also be achieved with a 
flag.

Tweaking the behavior with a flags value is necessary because hypervisors 
not always announce what they are capable of. One reason for this is that 
qemu is responsible for setting most of the cpuid flags but the kvm kernel 
module offers some interfaces in all cases. Another reason seems to be 
simple oversight, for example Illuminos KVM forgot to add the cpuid stuff 
from Linux KVM. And if there is some bug it may be a good idea to have a 
simple way to check if it is related to the paravirt stuff.

I have just noticed that there is support in config(8) to set non-devices 
related int values. But this does not seem to be supported in the 
in-kernel UKC. And I can't see right now how config finds the valid int 
values in the kernel. Or is it just necessary to add the name of the 
variable prepended by an underscore to the config tool? Would this be 
preferred over introducing the paravirt device? But being able to set this 
from the in-kernel UKC would be nice, too.



  1   2   >