On Wed, Jun 17, 2020 at 01:17:37PM +0200, Hrvoje Popovski wrote:
> On 17.6.2020. 13:13, Jonathan Matthew wrote:
> > On Wed, Jun 17, 2020 at 12:50:46PM +0200, Hrvoje Popovski wrote:
> >> On 17.6.2020. 12:45, Hrvoje Popovski wrote:
> >>> On 17.6.2020. 11:27, Hrvoje Popovski wrote:
> >>>> On 17.6.2020. 10:36, David Gwynne wrote:
> >>>>> this is an updated version of a diff from christiano haesbaert by way of
> >>>>> mpi@ to enable the use of multiple tx and rx rings with msi-x.
> >>>>>
> >>>>> the high level description is that that driver checks to see if msix is
> >>>>> available, and if so how many vectors it has. it then gets an intrmap
> >>>>> based on that information, and bumps the number of queues to the number
> >>>>> of cpus that intrmap says are available.
> >>>>>
> >>>>> once the queues are allocated, it then iterates over them and wires up
> >>>>> interrupts to the cpus provided by the intrmap.
> >>>>>
> >>>>> im happy for people to try this out, but i can't commit it until all the
> >>>>> architectures that ix(4) is enabled on support the APIs that it's using.
> >>>>> this basically means it'll work on amd64 (and a little bit on i386), but
> >>>>> not much else. please hold back your tears and cries of anguish.
> >>>>>
> >>>>> thanks to christiano and mpi for doing most of the work leading up to
> >>>>> this diff :)
> >>>>
> >>>> Hi,
> >>>>
> >>>> first, thank you all for mq work :)
> >>>>
> >>>> with this diff, if i'm sending traffic over ix and at the same time
> >>>> execute ifconfig ix down/up, forwarding stops until i stop generator,
> >>>> wait for few seconds and execute ifconfig ix down/up few times and than
> >>>> forwarding start normally

i'll have to wire up a topology i can test this properly with when i get
back to the office.

> >> in vmstat i should see ix0:0-5 and ix1:0-5 ?
> > 
> > vmstat -i only shows interrupts that have actually fired. Use -zi to show
> > all interrupts.
> > 
> > This diff doesn't set up RSS, so received packets will only go to the first
> > vector, which is why only one of the ix1 interrupts has fired. Outgoing
> > packets are scattered across the tx queues, so all the ix0 interrupts have
> > fired.
> 
> yes, thank you ..

i had a look at setting up RSS today, and it turns out it's already
there. it's using a randomly generated toeplitz key (which is ok), but
it only hashes ipv4, ipv6, and tcp. if your test traffic is udp
between the same 2 IPs, it'll land on the same rx ring.

anyway, here's an updated diff with a couple of tweaks. firstly, it uses
intr_barrier when the interface is going down to make sure the rings
arent in use on another cpu, and secondly it wires ix up with the
stoeplitz code.

Index: if_ix.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.c,v
retrieving revision 1.166
diff -u -p -r1.166 if_ix.c
--- if_ix.c     7 Jun 2020 23:52:05 -0000       1.166
+++ if_ix.c     19 Jun 2020 05:02:31 -0000
@@ -115,7 +115,7 @@ void        ixgbe_identify_hardware(struct ix_s
 int    ixgbe_allocate_pci_resources(struct ix_softc *);
 int    ixgbe_allocate_legacy(struct ix_softc *);
 int    ixgbe_allocate_msix(struct ix_softc *);
-int    ixgbe_setup_msix(struct ix_softc *);
+void   ixgbe_setup_msix(struct ix_softc *);
 int    ixgbe_allocate_queues(struct ix_softc *);
 void   ixgbe_free_pci_resources(struct ix_softc *);
 void   ixgbe_local_timer(void *);
@@ -199,7 +199,7 @@ struct cfattach ix_ca = {
 };
 
 int ixgbe_smart_speed = ixgbe_smart_speed_on;
-int ixgbe_enable_msix = 0;
+int ixgbe_enable_msix = 1;
 
 /*********************************************************************
  *  Device identification routine
@@ -301,7 +301,7 @@ ixgbe_attach(struct device *parent, stru
        bcopy(sc->hw.mac.addr, sc->arpcom.ac_enaddr,
            IXGBE_ETH_LENGTH_OF_ADDRESS);
 
-       if (sc->msix > 1)
+       if (sc->sc_intrmap)
                error = ixgbe_allocate_msix(sc);
        else
                error = ixgbe_allocate_legacy(sc);
@@ -798,7 +798,7 @@ ixgbe_init(void *arg)
        timeout_add_sec(&sc->timer, 1);
 
        /* Set up MSI/X routing */
-       if (sc->msix > 1) {
+       if (sc->sc_intrmap) {
                ixgbe_configure_ivars(sc);
                /* Set up auto-mask */
                if (sc->hw.mac.type == ixgbe_mac_82598EB)
@@ -829,7 +829,7 @@ ixgbe_init(void *arg)
                itr |= IXGBE_EITR_LLI_MOD | IXGBE_EITR_CNT_WDIS;
        IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(0), itr);
 
-       if (sc->msix > 1) {
+       if (sc->sc_intrmap) {
                /* Set moderation on the Link interrupt */
                IXGBE_WRITE_REG(&sc->hw, IXGBE_EITR(sc->linkvec),
                    IXGBE_LINK_ITR);
@@ -903,7 +903,7 @@ ixgbe_config_gpie(struct ix_softc *sc)
                gpie |= 0xf << IXGBE_GPIE_LLI_DELAY_SHIFT;
        }
 
-       if (sc->msix > 1) {
+       if (sc->sc_intrmap) {
                /* Enable Enhanced MSIX mode */
                gpie |= IXGBE_GPIE_MSIX_MODE;
                gpie |= IXGBE_GPIE_EIAME | IXGBE_GPIE_PBA_SUPPORT |
@@ -1617,6 +1617,7 @@ ixgbe_stop(void *arg)
                ifq_barrier(ifq);
                ifq_clr_oactive(ifq);
 
+               intr_barrier(sc->queues[i].tag);
                timeout_del(&sc->rx_rings[i].rx_refill);
        }
 
@@ -1717,80 +1718,86 @@ ixgbe_allocate_msix(struct ix_softc *sc)
 {
        struct ixgbe_osdep      *os = &sc->osdep;
        struct pci_attach_args  *pa  = &os->os_pa;
-       int                      vec, error = 0;
-       struct ix_queue         *que = sc->queues;
-       const char              *intrstr = NULL;
-       pci_chipset_tag_t       pc = pa->pa_pc;
+       int                      i = 0, error = 0;
+       struct ix_queue         *que;
        pci_intr_handle_t       ih;
 
-       vec = 0;
-       if (pci_intr_map_msix(pa, vec, &ih)) {
-               printf(": couldn't map interrupt\n");
-               return (ENXIO);
-       }
+       for (i = 0, que = sc->queues; i < sc->num_queues; i++, que++) {
+               if (pci_intr_map_msix(pa, i, &ih)) {
+                       printf("ixgbe_allocate_msix: "
+                           "pci_intr_map_msix vec %d failed\n", i);
+                       error = ENOMEM;
+                       goto fail;
+               }
 
-       que->msix = vec;
-       snprintf(que->name, sizeof(que->name), "%s:%d", sc->dev.dv_xname, vec);
+               que->tag = pci_intr_establish_cpu(pa->pa_pc, ih,
+                   IPL_NET | IPL_MPSAFE, intrmap_cpu(sc->sc_intrmap, i),
+                   ixgbe_queue_intr, que, que->name);
+               if (que->tag == NULL) {
+                       printf("ixgbe_allocate_msix: "
+                           "pci_intr_establish vec %d failed\n", i);
+                       error = ENOMEM;
+                       goto fail;
+               }
 
-       intrstr = pci_intr_string(pc, ih);
-       que->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
-           ixgbe_queue_intr, que, que->name);
-       if (que->tag == NULL) {
-               printf(": couldn't establish interrupt");
-               if (intrstr != NULL)
-                       printf(" at %s", intrstr);
-               printf("\n");
-               return (ENXIO);
+               que->msix = i;
        }
 
        /* Now the link status/control last MSI-X vector */
-       vec++;
-       if (pci_intr_map_msix(pa, vec, &ih)) {
-               printf(": couldn't map link vector\n");
-               error = ENXIO;
+       if (pci_intr_map_msix(pa, i, &ih)) {
+               printf("ixgbe_allocate_msix: "
+                   "pci_intr_map_msix link vector failed\n");
+               error = ENOMEM;
                goto fail;
        }
 
-       intrstr = pci_intr_string(pc, ih);
-       sc->tag = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
+       sc->tag = pci_intr_establish(pa->pa_pc, ih, IPL_NET | IPL_MPSAFE,
            ixgbe_link_intr, sc, sc->dev.dv_xname);
        if (sc->tag == NULL) {
-               printf(": couldn't establish interrupt");
-               if (intrstr != NULL)
-                       printf(" at %s", intrstr);
-               printf("\n");
-               error = ENXIO;
+               printf("ixgbe_allocate_msix: "
+                   "pci_intr_establish link vector failed\n");
+               error = ENOMEM;
                goto fail;
        }
-
-       sc->linkvec = vec;
-       printf(", %s, %d queue%s", intrstr, vec, (vec > 1) ? "s" : "");
+       sc->linkvec = i;
+       printf(", %s, %d queue%s", pci_intr_string(pa->pa_pc, ih),
+           i, (i > 1) ? "s" : "");
 
        return (0);
 fail:
-       pci_intr_disestablish(pc, que->tag);
-       que->tag = NULL;
+       for (que = sc->queues; i > 0; i--, que++) {
+               if (que->tag == NULL)
+                       continue;
+               pci_intr_disestablish(pa->pa_pc, que->tag);
+               que->tag = NULL;
+       }
+
        return (error);
 }
 
-int
+void
 ixgbe_setup_msix(struct ix_softc *sc)
 {
        struct ixgbe_osdep      *os = &sc->osdep;
        struct pci_attach_args  *pa = &os->os_pa;
-       pci_intr_handle_t        dummy;
+       int                      nmsix;
+       unsigned int             maxq;
 
        if (!ixgbe_enable_msix)
-               return (0);
+               return;
 
-       /*
-        * Try a dummy map, maybe this bus doesn't like MSI, this function
-        * has no side effects.
-        */
-       if (pci_intr_map_msix(pa, 0, &dummy))
-               return (0);
+       nmsix = pci_intr_msix_count(pa->pa_pc, pa->pa_tag);
+       if (nmsix <= 1)
+               return;
+
+       /* give one vector to events */
+       nmsix--;
 
-       return (2); /* queue vector + link vector */
+       /* XXX the number of queues is limited to what we can keep stats on */
+       maxq = (sc->hw.mac.type == ixgbe_mac_82598EB) ? 8 : 16;
+
+       sc->sc_intrmap = intrmap_create(&sc->dev, nmsix, maxq, 0);
+       sc->num_queues = intrmap_count(sc->sc_intrmap);
 }
 
 int
@@ -1818,7 +1825,7 @@ ixgbe_allocate_pci_resources(struct ix_s
        sc->hw.back = os;
 
        /* Now setup MSI or MSI/X, return us the number of supported vectors. */
-       sc->msix = ixgbe_setup_msix(sc);
+       ixgbe_setup_msix(sc);
 
        return (0);
 }
@@ -2159,6 +2166,8 @@ ixgbe_allocate_queues(struct ix_softc *s
                que->sc = sc;
                que->txr = &sc->tx_rings[i];
                que->rxr = &sc->rx_rings[i];
+               snprintf(que->name, sizeof(que->name), "%s:%d",
+                   sc->dev.dv_xname, i);
        }
 
        return (0);
@@ -2925,7 +2934,7 @@ ixgbe_initialize_rss_mapping(struct ix_s
        int i, j, queue_id, table_size, index_mult;
 
        /* set up random bits */
-       arc4random_buf(&rss_key, sizeof(rss_key));
+       stoeplitz_to_key(&rss_key, sizeof(rss_key));
 
        /* Set multiplier for RETA setup and table size based on MAC */
        index_mult = 0x1;
@@ -3062,6 +3071,8 @@ ixgbe_rxeof(struct rx_ring *rxr)
 
        i = rxr->next_to_check;
        while (if_rxr_inuse(&rxr->rx_ring) > 0) {
+               uint32_t hash, hashtype;
+
                bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
                    dsize * i, dsize, BUS_DMASYNC_POSTREAD);
 
@@ -3089,6 +3100,9 @@ ixgbe_rxeof(struct rx_ring *rxr)
                    IXGBE_RXDADV_PKTTYPE_MASK;
                vtag = letoh16(rxdesc->wb.upper.vlan);
                eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);
+               hash = lemtoh32(&rxdesc->wb.lower.hi_dword.rss);
+               hashtype = lemtoh32(&rxdesc->wb.lower.lo_dword.data) &
+                   IXGBE_RXDADV_RSSTYPE_MASK;
 
                if (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) {
                        sc->dropped_pkts++;
@@ -3163,6 +3177,11 @@ ixgbe_rxeof(struct rx_ring *rxr)
 
                        ixgbe_rx_checksum(staterr, sendmp, ptype);
 
+                       if (hashtype != IXGBE_RXDADV_RSSTYPE_NONE) {
+                               sendmp->m_pkthdr.ph_flowid = hash;
+                               SET(sendmp->m_pkthdr.csum_flags, M_FLOWID);
+                       }
+
                        ml_enqueue(&ml, sendmp);
                }
 next_desc:
@@ -3295,7 +3314,7 @@ ixgbe_enable_intr(struct ix_softc *sc)
        IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask);
 
        /* With MSI-X we use auto clear */
-       if (sc->msix > 1) {
+       if (sc->sc_intrmap) {
                mask = IXGBE_EIMS_ENABLE_MASK;
                /* Don't autoclear Link */
                mask &= ~IXGBE_EIMS_OTHER;
@@ -3309,7 +3328,7 @@ ixgbe_enable_intr(struct ix_softc *sc)
 void
 ixgbe_disable_intr(struct ix_softc *sc)
 {
-       if (sc->msix > 1)
+       if (sc->sc_intrmap)
                IXGBE_WRITE_REG(&sc->hw, IXGBE_EIAC, 0);
        if (sc->hw.mac.type == ixgbe_mac_82598EB) {
                IXGBE_WRITE_REG(&sc->hw, IXGBE_EIMC, ~0);
Index: if_ix.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_ix.h,v
retrieving revision 1.41
diff -u -p -r1.41 if_ix.h
--- if_ix.h     7 Jun 2020 23:52:05 -0000       1.41
+++ if_ix.h     19 Jun 2020 05:02:31 -0000
@@ -234,7 +234,7 @@ struct ix_softc {
 
        struct ifmedia          media;
        struct timeout          timer;
-       int                     msix;
+       struct intrmap          *sc_intrmap;
        int                     if_flags;
 
        uint16_t                num_vlans;
Index: ixgbe.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/ixgbe.h,v
retrieving revision 1.29
diff -u -p -r1.29 ixgbe.h
--- ixgbe.h     2 Mar 2020 01:59:01 -0000       1.29
+++ ixgbe.h     19 Jun 2020 05:02:31 -0000
@@ -53,10 +53,12 @@
 #include <sys/timeout.h>
 #include <sys/pool.h>
 #include <sys/rwlock.h>
+#include <sys/intrmap.h>
 #include <sys/atomic.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
+#include <net/toeplitz.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
Index: files.pci
===================================================================
RCS file: /cvs/src/sys/dev/pci/files.pci,v
retrieving revision 1.347
diff -u -p -r1.347 files.pci
--- files.pci   17 Jun 2020 06:36:17 -0000      1.347
+++ files.pci   19 Jun 2020 05:02:31 -0000
@@ -343,7 +343,7 @@ file        dev/pci/ixgb_ee.c               ixgb
 file   dev/pci/ixgb_hw.c               ixgb
 
 # Intel 82598 10GbE
-device ix: ether, ifnet, ifmedia
+device ix: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach ix at pci
 file   dev/pci/if_ix.c                 ix
 file   dev/pci/ixgbe.c                 ix

Reply via email to