Re: em(4) towards multiqueues

2020-02-18 Thread Martin Pieuchot
On 18/02/20(Tue) 22:39, Jonathan Matthew wrote:
> On Fri, Feb 14, 2020 at 06:28:20PM +0100, Martin Pieuchot wrote:
> > @@ -2597,13 +2635,6 @@ em_initialize_receive_unit(struct em_sof
> > E1000_WRITE_REG(>hw, ITR, DEFAULT_ITR);
> > }
> >  
> > -   /* Setup the Base and Length of the Rx Descriptor Ring */
> > -   bus_addr = sc->sc_rx_dma.dma_map->dm_segs[0].ds_addr;
> > -   E1000_WRITE_REG(>hw, RDLEN(0),
> > -   sc->sc_rx_slots * sizeof(*sc->sc_rx_desc_ring));
> > -   E1000_WRITE_REG(>hw, RDBAH(0), (u_int32_t)(bus_addr >> 32));
> > -   E1000_WRITE_REG(>hw, RDBAL(0), (u_int32_t)bus_addr);
> > -
> > /* Setup the Receive Control Register */
> > reg_rctl = E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
> > E1000_RCTL_RDMTS_HALF |
> > @@ -2653,6 +2684,13 @@ em_initialize_receive_unit(struct em_sof
> > if (sc->hw.mac_type == em_82573)
> > E1000_WRITE_REG(>hw, RDTR, 0x20);
> >  
> > +   /* Setup the Base and Length of the Rx Descriptor Ring */
> > +   bus_addr = que->rx.sc_rx_dma.dma_map->dm_segs[0].ds_addr;
> > +   E1000_WRITE_REG(>hw, RDLEN(0),
> > +   sc->sc_rx_slots * sizeof(*que->rx.sc_rx_desc_ring));
> > +   E1000_WRITE_REG(>hw, RDBAH(0), (u_int32_t)(bus_addr >> 32));
> > +   E1000_WRITE_REG(>hw, RDBAL(0), (u_int32_t)bus_addr);
> > +
> 
> Just to make sure that I follow how this works, should we be setting
> RDLEN(que->me) rather than RDLEN(0) here, and likewise for RDBAH and RDBAL?
> This value is always 0 at this stage, so of course it doesn't matter yet.

Nice catch!  Diff below fixes these :o)

Index: dev/pci/if_em.c
===
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.344
diff -u -p -r1.344 if_em.c
--- dev/pci/if_em.c 4 Feb 2020 10:59:23 -   1.344
+++ dev/pci/if_em.c 18 Feb 2020 13:04:19 -
@@ -257,25 +257,25 @@ void em_free_transmit_structures(struct 
 void em_free_receive_structures(struct em_softc *);
 void em_update_stats_counters(struct em_softc *);
 void em_disable_aspm(struct em_softc *);
-void em_txeof(struct em_softc *);
+void em_txeof(struct em_queue *);
 int  em_allocate_receive_structures(struct em_softc *);
 int  em_allocate_transmit_structures(struct em_softc *);
 int  em_allocate_desc_rings(struct em_softc *);
-int  em_rxfill(struct em_softc *);
+int  em_rxfill(struct em_queue *);
 void em_rxrefill(void *);
-int  em_rxeof(struct em_softc *);
+int  em_rxeof(struct em_queue *);
 void em_receive_checksum(struct em_softc *, struct em_rx_desc *,
 struct mbuf *);
-u_int  em_transmit_checksum_setup(struct em_softc *, struct mbuf *, u_int,
+u_int  em_transmit_checksum_setup(struct em_queue *, struct mbuf *, u_int,
u_int32_t *, u_int32_t *);
 void em_iff(struct em_softc *);
 #ifdef EM_DEBUG
 void em_print_hw_stats(struct em_softc *);
 #endif
 void em_update_link_status(struct em_softc *);
-int  em_get_buf(struct em_softc *, int);
+int  em_get_buf(struct em_queue *, int);
 void em_enable_hw_vlans(struct em_softc *);
-u_int em_encap(struct em_softc *, struct mbuf *);
+u_int em_encap(struct em_queue *, struct mbuf *);
 void em_smartspeed(struct em_softc *);
 int  em_82547_fifo_workaround(struct em_softc *, int);
 void em_82547_update_fifo_head(struct em_softc *, int);
@@ -286,8 +286,8 @@ int  em_dma_malloc(struct em_softc *, bu
 void em_dma_free(struct em_softc *, struct em_dma_alloc *);
 u_int32_t em_fill_descriptors(u_int64_t address, u_int32_t length,
  PDESC_ARRAY desc_array);
-void em_flush_tx_ring(struct em_softc *);
-void em_flush_rx_ring(struct em_softc *);
+void em_flush_tx_ring(struct em_queue *);
+void em_flush_rx_ring(struct em_queue *);
 void em_flush_desc_rings(struct em_softc *);
 
 /*
@@ -383,7 +383,6 @@ em_attach(struct device *parent, struct 
 
timeout_set(>timer_handle, em_local_timer, sc);
timeout_set(>tx_fifo_timer_handle, em_82547_move_tail, sc);
-   timeout_set(>rx_refill, em_rxrefill, sc);
 
/* Determine hardware revision */
em_identify_hardware(sc);
@@ -601,6 +600,7 @@ em_start(struct ifqueue *ifq)
u_int head, free, used;
struct mbuf *m;
int post = 0;
+   struct em_queue *que = sc->queues; /* Use only first queue. */
 
if (!sc->link_active) {
ifq_purge(ifq);
@@ -608,15 +608,15 @@ em_start(struct ifqueue *ifq)
}
 
/* calculate free space */
-   head = sc->sc_tx_desc_head;
-   free = sc->sc_tx_desc_tail;
+   head = que->tx.sc_tx_desc_head;
+   free = que->tx.sc_tx_desc_tail;
if (free <= head)
free += sc->sc_tx_slots;
free -= head;
 
if (sc->hw.mac_type != em_82547) {
-   bus_dmamap_sync(sc->sc_dmat, sc->sc_tx_dma.dma_map,
-   0, sc->sc_tx_dma.dma_map->dm_mapsize,
+   bus_dmamap_sync(sc->sc_dmat, 

Re: em(4) towards multiqueues

2020-02-18 Thread Jonathan Matthew
On Fri, Feb 14, 2020 at 06:28:20PM +0100, Martin Pieuchot wrote:
> This diff introduces the concept of "queue" in the em(4) driver.  The
> logic present in ix(4) has been matched for coherency.
> 
> Currently the driver uses a single queue and the diff below doesn't
> change anything in that regard.  It can be viewed as the introduction
> of an abstraction.
> 
> I'd like to get this in to reduce the differences between em(4) and
> ix(4) when it comes to multiqueues.  My intend is to keep the upcoming
> changes as small as possible such that we can concentrate our efforts on
> the important bits.  So far we started discussing the interaction between
> CPU and mapping MSIX vectors but other related topics will appear :o)
> 
> I'm running this on:
> 
>   em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi
>   em0 at pci0 dev 20 function 0 "Intel I354 SGMII" rev 0x03: msi
> 
> More tests are always welcome ;)

I can test this on some other hardware tomorrow.

I read through it and it all makes sense to me.  One question below,
but ok jmatthew@

> @@ -2597,13 +2635,6 @@ em_initialize_receive_unit(struct em_sof
>   E1000_WRITE_REG(>hw, ITR, DEFAULT_ITR);
>   }
>  
> - /* Setup the Base and Length of the Rx Descriptor Ring */
> - bus_addr = sc->sc_rx_dma.dma_map->dm_segs[0].ds_addr;
> - E1000_WRITE_REG(>hw, RDLEN(0),
> - sc->sc_rx_slots * sizeof(*sc->sc_rx_desc_ring));
> - E1000_WRITE_REG(>hw, RDBAH(0), (u_int32_t)(bus_addr >> 32));
> - E1000_WRITE_REG(>hw, RDBAL(0), (u_int32_t)bus_addr);
> -
>   /* Setup the Receive Control Register */
>   reg_rctl = E1000_RCTL_EN | E1000_RCTL_BAM | E1000_RCTL_LBM_NO |
>   E1000_RCTL_RDMTS_HALF |
> @@ -2653,6 +2684,13 @@ em_initialize_receive_unit(struct em_sof
>   if (sc->hw.mac_type == em_82573)
>   E1000_WRITE_REG(>hw, RDTR, 0x20);
>  
> + /* Setup the Base and Length of the Rx Descriptor Ring */
> + bus_addr = que->rx.sc_rx_dma.dma_map->dm_segs[0].ds_addr;
> + E1000_WRITE_REG(>hw, RDLEN(0),
> + sc->sc_rx_slots * sizeof(*que->rx.sc_rx_desc_ring));
> + E1000_WRITE_REG(>hw, RDBAH(0), (u_int32_t)(bus_addr >> 32));
> + E1000_WRITE_REG(>hw, RDBAL(0), (u_int32_t)bus_addr);
> +

Just to make sure that I follow how this works, should we be setting
RDLEN(que->me) rather than RDLEN(0) here, and likewise for RDBAH and RDBAL?
This value is always 0 at this stage, so of course it doesn't matter yet.



Re: em(4) towards multiqueues

2020-02-16 Thread Hrvoje Popovski
On 14.2.2020. 18:28, Martin Pieuchot wrote:
> I'm running this on:
> 
>   em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi
>   em0 at pci0 dev 20 function 0 "Intel I354 SGMII" rev 0x03: msi
> 
> More tests are always welcome ;)


em0 at pci0 dev 25 function 0 "Intel 82579LM" rev 0x04: msi
em1 at pci1 dev 0 function 1 "Intel 82571EB" rev 0x06: apic 0 int 17
em4 at pci2 dev 0 function 1 "Intel 82576" rev 0x01: msi
em5 at pci3 dev 0 function 0 "Intel 82572EI" rev 0x06: apic 0 int 16
em0 at pci9 dev 0 function 0 "Intel I350" rev 0x01: msi

it just works .. :)




em(4) towards multiqueues

2020-02-14 Thread Martin Pieuchot
This diff introduces the concept of "queue" in the em(4) driver.  The
logic present in ix(4) has been matched for coherency.

Currently the driver uses a single queue and the diff below doesn't
change anything in that regard.  It can be viewed as the introduction
of an abstraction.

I'd like to get this in to reduce the differences between em(4) and
ix(4) when it comes to multiqueues.  My intend is to keep the upcoming
changes as small as possible such that we can concentrate our efforts on
the important bits.  So far we started discussing the interaction between
CPU and mapping MSIX vectors but other related topics will appear :o)

I'm running this on:

em0 at pci1 dev 0 function 0 "Intel I210" rev 0x03: msi
em0 at pci0 dev 20 function 0 "Intel I354 SGMII" rev 0x03: msi

More tests are always welcome ;)

Index: dev/pci/if_em.c
===
RCS file: /cvs/src/sys/dev/pci/if_em.c,v
retrieving revision 1.344
diff -u -p -r1.344 if_em.c
--- dev/pci/if_em.c 4 Feb 2020 10:59:23 -   1.344
+++ dev/pci/if_em.c 14 Feb 2020 17:13:06 -
@@ -257,25 +257,25 @@ void em_free_transmit_structures(struct 
 void em_free_receive_structures(struct em_softc *);
 void em_update_stats_counters(struct em_softc *);
 void em_disable_aspm(struct em_softc *);
-void em_txeof(struct em_softc *);
+void em_txeof(struct em_queue *);
 int  em_allocate_receive_structures(struct em_softc *);
 int  em_allocate_transmit_structures(struct em_softc *);
 int  em_allocate_desc_rings(struct em_softc *);
-int  em_rxfill(struct em_softc *);
+int  em_rxfill(struct em_queue *);
 void em_rxrefill(void *);
-int  em_rxeof(struct em_softc *);
+int  em_rxeof(struct em_queue *);
 void em_receive_checksum(struct em_softc *, struct em_rx_desc *,
 struct mbuf *);
-u_int  em_transmit_checksum_setup(struct em_softc *, struct mbuf *, u_int,
+u_int  em_transmit_checksum_setup(struct em_queue *, struct mbuf *, u_int,
u_int32_t *, u_int32_t *);
 void em_iff(struct em_softc *);
 #ifdef EM_DEBUG
 void em_print_hw_stats(struct em_softc *);
 #endif
 void em_update_link_status(struct em_softc *);
-int  em_get_buf(struct em_softc *, int);
+int  em_get_buf(struct em_queue *, int);
 void em_enable_hw_vlans(struct em_softc *);
-u_int em_encap(struct em_softc *, struct mbuf *);
+u_int em_encap(struct em_queue *, struct mbuf *);
 void em_smartspeed(struct em_softc *);
 int  em_82547_fifo_workaround(struct em_softc *, int);
 void em_82547_update_fifo_head(struct em_softc *, int);
@@ -286,8 +286,8 @@ int  em_dma_malloc(struct em_softc *, bu
 void em_dma_free(struct em_softc *, struct em_dma_alloc *);
 u_int32_t em_fill_descriptors(u_int64_t address, u_int32_t length,
  PDESC_ARRAY desc_array);
-void em_flush_tx_ring(struct em_softc *);
-void em_flush_rx_ring(struct em_softc *);
+void em_flush_tx_ring(struct em_queue *);
+void em_flush_rx_ring(struct em_queue *);
 void em_flush_desc_rings(struct em_softc *);
 
 /*
@@ -383,7 +383,6 @@ em_attach(struct device *parent, struct 
 
timeout_set(>timer_handle, em_local_timer, sc);
timeout_set(>tx_fifo_timer_handle, em_82547_move_tail, sc);
-   timeout_set(>rx_refill, em_rxrefill, sc);
 
/* Determine hardware revision */
em_identify_hardware(sc);
@@ -601,6 +600,7 @@ em_start(struct ifqueue *ifq)
u_int head, free, used;
struct mbuf *m;
int post = 0;
+   struct em_queue *que = sc->queues; /* Use only first queue. */
 
if (!sc->link_active) {
ifq_purge(ifq);
@@ -608,15 +608,15 @@ em_start(struct ifqueue *ifq)
}
 
/* calculate free space */
-   head = sc->sc_tx_desc_head;
-   free = sc->sc_tx_desc_tail;
+   head = que->tx.sc_tx_desc_head;
+   free = que->tx.sc_tx_desc_tail;
if (free <= head)
free += sc->sc_tx_slots;
free -= head;
 
if (sc->hw.mac_type != em_82547) {
-   bus_dmamap_sync(sc->sc_dmat, sc->sc_tx_dma.dma_map,
-   0, sc->sc_tx_dma.dma_map->dm_mapsize,
+   bus_dmamap_sync(sc->sc_dmat, que->tx.sc_tx_dma.dma_map,
+   0, que->tx.sc_tx_dma.dma_map->dm_mapsize,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
}
 
@@ -631,7 +631,7 @@ em_start(struct ifqueue *ifq)
if (m == NULL)
break;
 
-   used = em_encap(sc, m);
+   used = em_encap(que, m);
if (used == 0) {
m_freem(m);
continue;
@@ -656,8 +656,8 @@ em_start(struct ifqueue *ifq)
if (sc->link_duplex == HALF_DUPLEX)
em_82547_move_tail_locked(sc);
else {
-   E1000_WRITE_REG(>hw, TDT(0),
-