On Wed, Jun 17, 2020 at 01:17:37PM +0200, Hrvoje Popovski wrote:
> On 17.6.2020. 13:13, Jonathan Matthew wrote:
> > On Wed, Jun 17, 2020 at 12:50:46PM +0200, Hrvoje Popovski wrote:
> >> On 17.6.2020. 12:45, Hrvoje Popovski wrote:
> >>> On 17.6.2020. 11:27, Hrvoje Popovski wrote:
> >>>> On 17.6.2020. 10:36, David Gwynne wrote:
> >>>>> this is an updated version of a diff from christiano haesbaert by way of
> >>>>> mpi@ to enable the use of multiple tx and rx rings with msi-x.
> >>>>>
> >>>>> the high level description is that that driver checks to see if msix is
> >>>>> available, and if so how many vectors it has. it then gets an intrmap
> >>>>> based on that information, and bumps the number of queues to the number
> >>>>> of cpus that intrmap says are available.
> >>>>>
> >>>>> once the queues are allocated, it then iterates over them and wires up
> >>>>> interrupts to the cpus provided by the intrmap.
> >>>>>
> >>>>> im happy for people to try this out, but i can't commit it until all the
> >>>>> architectures that ix(4) is enabled on support the APIs that it's using.
> >>>>> this basically means it'll work on amd64 (and a little bit on i386), but
> >>>>> not much else. please hold back your tears and cries of anguish.
> >>>>>
> >>>>> thanks to christiano and mpi for doing most of the work leading up to
> >>>>> this diff :)
> >>>>
> >>>> Hi,
> >>>>
> >>>> first, thank you all for mq work :)
> >>>>
> >>>> with this diff, if i'm sending traffic over ix and at the same time
> >>>> execute ifconfig ix down/up, forwarding stops until i stop generator,
> >>>> wait for few seconds and execute ifconfig ix down/up few times and than
> >>>> forwarding start normally
i'll have to wire up a topology i can test this properly with when i get
back to the office.
> >> in vmstat i should see ix0:0-5 and ix1:0-5 ?
> >
> > vmstat -i only shows interrupts that have actually fired. Use -zi to show
> > all interrupts.
> >
> > This diff doesn't set up RSS, so received packets will only go to the first
> > vector, which is why only one of the ix1 interrupts has fired. Outgoing
> > packets are scattered across the tx queues, so all the ix0 interrupts have
> > fired.
>
> yes, thank you ..
i had a look at setting up RSS today, and it turns out it's already
there. it's using a randomly generated toeplitz key (which is ok), but
it only hashes ipv4, ipv6, and tcp. if your test traffic is udp
between the same 2 IPs, it'll land on the same rx ring.
anyway, here's an updated diff with a couple of tweaks. firstly, it uses
intr_barrier when the interface is going down to make sure the rings
arent in use on another cpu, and secondly it wires ix up with the
stoeplitz code.
Index: if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.166
diff -u -p -r1.166 if_ix.c
--- if_ix.c 7 Jun 2020 23:52:05 -0000 1.166
+++ if_ix.c 19 Jun 2020 05:02:31 -0000
@@ -115,7 +115,7 @@ void ixgbe_identify_hardware(struct ix_s
int ixgbe_allocate_pci_resources(struct ix_softc *);
int ixgbe_allocate_legacy(struct ix_softc *);
int ixgbe_allocate_msix(struct ix_softc *);
-int ixgbe_setup_msix(struct ix_softc *);
+void ixgbe_setup_msix(struct ix_softc *);
int ixgbe_allocate_queues(struct ix_softc *);
void ixgbe_free_pci_resources(struct ix_softc *);
void ixgbe_local_timer(void *);
@@ -199,7 +199,7 @@ struct cfattach ix_ca = {
};
int ixgbe_smart_speed = ixgbe_smart_speed_on;
-int ixgbe_enable_msix = 0;
+int ixgbe_enable_msix = 1;
/*********************************************************************
* Device identification routine
@@ -301,7 +301,7 @@ ixgbe_attach(struct device *parent, stru
bcopy(sc->hw.mac.addr, sc->arpcom.ac_enaddr,
IXGBE_ETH_LENGTH_OF_ADDRESS);
- if (sc->msix > 1)
+ if (sc->sc_intrmap)
error = ixgbe_allocate_msix(sc);
else
error = ixgbe_allocate_legacy(sc);
@@ -798,7 +798,7 @@ ixgbe_init(void *arg)
timeout_add_sec(&sc->timer, 1);
/* Set up MSI/X routing */
- if (sc->msix > 1) {
+ if (sc->sc_intrmap) {
ixgbe_configure_ivars(sc);
/* Set up auto-mask */
if (sc->hw.mac.type == ixgbe_mac_82598EB)
@@ -829,7 +829,7 @@ ixgbe_init(void *arg)
itr |= IXGBE_EITR_LLI_MOD | IXGBE_EITR_CNT_WDIS;
IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(0), itr);
- if (sc->msix > 1) {
+ if (sc->sc_intrmap) {
/* Set moderation on the Link interrupt */
IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(sc->linkvec),
IXGBE_LINK_ITR);
@@ -903,7 +903,7 @@ ixgbe_config_gpie(struct ix_softc *sc)
gpie |= 0xf << IXGBE_GPIE_LLI_DELAY_SHIFT;
}
- if (sc->msix > 1) {
+ if (sc->sc_intrmap) {
/* Enable Enhanced MSIX mode */
gpie |= IXGBE_GPIE_MSIX_MODE;
gpie |= IXGBE_GPIE_EIAME | IXGBE_GPIE_PBA_SUPPORT |
@@ -1617,6 +1617,7 @@ ixgbe_stop(void *arg)
ifq_barrier(ifq);
ifq_clr_oactive(ifq);
+ intr_barrier(sc->queues[i].tag);
timeout_del(&sc->rx_rings[i].rx_refill);
}
@@ -1717,80 +1718,86 @@ ixgbe_allocate_msix(struct ix_softc *sc)
{
struct ixgbe_osdep *os = &sc->osdep;
struct pci_attach_args *pa = &os->os_pa;
- int vec, error = 0;
- struct ix_queue *que = sc->queues;
- const char *intrstr = NULL;
- pci_chipset_tag_t pc = pa->pa_pc;
+ int i = 0, error = 0;
+ struct ix_queue *que;
pci_intr_handle_t ih;
- vec = 0;
- if (pci_intr_map_msix(pa, vec, &ih)) {
- printf(": couldn't map interrupt\n");
- return (ENXIO);
- }
+ for (i = 0, que = sc->queues; i < sc->num_queues; i++, que++) {
+ if (pci_intr_map_msix(pa, i, &ih)) {
+ printf("ixgbe_allocate_msix: "
+ "pci_intr_map_msix vec %d failed\n", i);
+ error = ENOMEM;
+ goto fail;
+ }
- que->msix = vec;
- snprintf(que->name, sizeof(que->name), "%s:%d", sc->dev.dv_xname, vec);
+ que->tag = pci_intr_establish_cpu(pa->pa_pc, ih,
+ IPL_NET | IPL_MPSAFE, intrmap_cpu(sc->sc_intrmap, i),
+ ixgbe_queue_intr, que, que->name);
+ if (que->tag == NULL) {
+ printf("ixgbe_allocate_msix: "
+ "pci_intr_establish vec %d failed\n", i);
+ error = ENOMEM;
+ goto fail;
+ }
- intrstr = pci_intr_string(pc, ih);
- que->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
- ixgbe_queue_intr, que, que->name);
- if (que->tag == NULL) {
- printf(": couldn't establish interrupt");
- if (intrstr != NULL)
- printf(" at %s", intrstr);
- printf("\n");
- return (ENXIO);
+ que->msix = i;
}
/* Now the link status/control last MSI-X vector */
- vec++;
- if (pci_intr_map_msix(pa, vec, &ih)) {
- printf(": couldn't map link vector\n");
- error = ENXIO;
+ if (pci_intr_map_msix(pa, i, &ih)) {
+ printf("ixgbe_allocate_msix: "
+ "pci_intr_map_msix link vector failed\n");
+ error = ENOMEM;
goto fail;
}
- intrstr = pci_intr_string(pc, ih);
- sc->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
+ sc->tag = pci_intr_establish(pa->pa_pc, ih, IPL_NET | IPL_MPSAFE,
ixgbe_link_intr, sc, sc->dev.dv_xname);
if (sc->tag == NULL) {
- printf(": couldn't establish interrupt");
- if (intrstr != NULL)
- printf(" at %s", intrstr);
- printf("\n");
- error = ENXIO;
+ printf("ixgbe_allocate_msix: "
+ "pci_intr_establish link vector failed\n");
+ error = ENOMEM;
goto fail;
}
-
- sc->linkvec = vec;
- printf(", %s, %d queue%s", intrstr, vec, (vec > 1) ? "s" : "");
+ sc->linkvec = i;
+ printf(", %s, %d queue%s", pci_intr_string(pa->pa_pc, ih),
+ i, (i > 1) ? "s" : "");
return (0);
fail:
- pci_intr_disestablish(pc, que->tag);
- que->tag = NULL;
+ for (que = sc->queues; i > 0; i--, que++) {
+ if (que->tag == NULL)
+ continue;
+ pci_intr_disestablish(pa->pa_pc, que->tag);
+ que->tag = NULL;
+ }
+
return (error);
}
-int
+void
ixgbe_setup_msix(struct ix_softc *sc)
{
struct ixgbe_osdep *os = &sc->osdep;
struct pci_attach_args *pa = &os->os_pa;
- pci_intr_handle_t dummy;
+ int nmsix;
+ unsigned int maxq;
if (!ixgbe_enable_msix)
- return (0);
+ return;
- /*
- * Try a dummy map, maybe this bus doesn't like MSI, this function
- * has no side effects.
- */
- if (pci_intr_map_msix(pa, 0, &dummy))
- return (0);
+ nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+ if (nmsix <= 1)
+ return;
+
+ /* give one vector to events */
+ nmsix--;
- return (2); /* queue vector + link vector */
+ /* XXX the number of queues is limited to what we can keep stats on */
+ maxq = (sc->hw.mac.type == ixgbe_mac_82598EB) ? 8 : 16;
+
+ sc->sc_intrmap = intrmap_create(&sc->dev, nmsix, maxq, 0);
+ sc->num_queues = intrmap_count(sc->sc_intrmap);
}
int
@@ -1818,7 +1825,7 @@ ixgbe_allocate_pci_resources(struct ix_s
sc->hw.back = os;
/* Now setup MSI or MSI/X, return us the number of supported vectors. */
- sc->msix = ixgbe_setup_msix(sc);
+ ixgbe_setup_msix(sc);
return (0);
}
@@ -2159,6 +2166,8 @@ ixgbe_allocate_queues(struct ix_softc *s
que->sc = sc;
que->txr = &sc->tx_rings[i];
que->rxr = &sc->rx_rings[i];
+ snprintf(que->name, sizeof(que->name), "%s:%d",
+ sc->dev.dv_xname, i);
}
return (0);
@@ -2925,7 +2934,7 @@ ixgbe_initialize_rss_mapping(struct ix_s
int i, j, queue_id, table_size, index_mult;
/* set up random bits */
- arc4random_buf(&rss_key, sizeof(rss_key));
+ stoeplitz_to_key(&rss_key, sizeof(rss_key));
/* Set multiplier for RETA setup and table size based on MAC */
index_mult = 0x1;
@@ -3062,6 +3071,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
i = rxr->next_to_check;
while (if_rxr_inuse(&rxr->rx_ring) > 0) {
+ uint32_t hash, hashtype;
+
bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
dsize * i, dsize, BUS_DMASYNC_POSTREAD);
@@ -3089,6 +3100,9 @@ ixgbe_rxeof(struct rx_ring *rxr)
IXGBE_RXDADV_PKTTYPE_MASK;
vtag = letoh16(rxdesc->wb.upper.vlan);
eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);
+ hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss);
+ hashtype = lemtoh32(&rxdesc->wb.lower.lo_dword.data) &
+ IXGBE_RXDADV_RSSTYPE_MASK;
if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
sc->dropped_pkts++;
@@ -3163,6 +3177,11 @@ ixgbe_rxeof(struct rx_ring *rxr)
ixgbe_rx_checksum(staterr, sendmp, ptype);
+ if (hashtype != IXGBE_RXDADV_RSSTYPE_NONE) {
+ sendmp->m_pkthdr.ph_flowid = hash;
+ SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
+ }
+
ml_enqueue(&ml, sendmp);
}
next_desc:
@@ -3295,7 +3314,7 @@ ixgbe_enable_intr(struct ix_softc *sc)
IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask);
/* With MSI-X we use auto clear */
- if (sc->msix > 1) {
+ if (sc->sc_intrmap) {
mask = IXGBE_EIMS_ENABLE_MASK;
/* Don't autoclear Link */
mask &= ~IXGBE_EIMS_OTHER;
@@ -3309,7 +3328,7 @@ ixgbe_enable_intr(struct ix_softc *sc)
void
ixgbe_disable_intr(struct ix_softc *sc)
{
- if (sc->msix > 1)
+ if (sc->sc_intrmap)
IXGBE_WRITE_REG(&sc->hw, IXGBE_EIAC, 0);
if (sc->hw.mac.type == ixgbe_mac_82598EB) {
IXGBE_WRITE_REG(&sc->hw, IXGBE_EIMC, ~0);
Index: if_ix.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.h,v
retrieving revision 1.41
diff -u -p -r1.41 if_ix.h
--- if_ix.h 7 Jun 2020 23:52:05 -0000 1.41
+++ if_ix.h 19 Jun 2020 05:02:31 -0000
@@ -234,7 +234,7 @@ struct ix_softc {
struct ifmedia media;
struct timeout timer;
- int msix;
+ struct intrmap *sc_intrmap;
int if_flags;
uint16_t num_vlans;
Index: ixgbe.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/ixgbe.h,v
retrieving revision 1.29
diff -u -p -r1.29 ixgbe.h
--- ixgbe.h 2 Mar 2020 01:59:01 -0000 1.29
+++ ixgbe.h 19 Jun 2020 05:02:31 -0000
@@ -53,10 +53,12 @@
#include <sys/timeout.h>
#include <sys/pool.h>
#include <sys/rwlock.h>
+#include <sys/intrmap.h>
#include <sys/atomic.h>
#include <net/if.h>
#include <net/if_media.h>
+#include <net/toeplitz.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
Index: files.pci
===================================================================
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.347
diff -u -p -r1.347 files.pci
--- files.pci 17 Jun 2020 06:36:17 -0000 1.347
+++ files.pci 19 Jun 2020 05:02:31 -0000
@@ -343,7 +343,7 @@ file dev/pci/ixgb_ee.c ixgb
file dev/pci/ixgb_hw.c ixgb
# Intel 82598 10GbE
-device ix: ether, ifnet, ifmedia
+device ix: ether, ifnet, ifmedia, intrmap, stoeplitz
attach ix at pci
file dev/pci/if_ix.c ix
file dev/pci/ixgbe.c ix