Re: ixv(4): porting Virtual Function driver for Intel 82599 series.

2022-12-20 Thread Yuichiro NAITO

I received an issue that ixv(4) doesn't detect linkdowns in personal email.
When a linkdown happens, the PF (Physical Function) driver interrupts all VFs
(Virtual Functions) via the `mailbox`. But ixv(4) doesn't receive the interrupt.

According to NetBSD, this problem has been fixed by the following commits.

http://www.nerv.org/netbsd/changeset.cgi?id=20171003T031229Z.85b67885ef5a26fe6778d9f35f3142c5e14d1de8#src/sys/dev/pci/ixgbe/ixv.c

http://www.nerv.org/netbsd/changeset.cgi?id=20171004T110321Z.7b02467efe35f10f476f3239901730fce3bdb494#src/sys/dev/pci/ixgbe/ixv.c

I ported these commits and updated my patch.

diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index 3563126b2a1..e421c38e95f 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -522,6 +522,7 @@ msk*at mskc?#  each port of 
above
 em*at pci? # Intel Pro/1000 ethernet
 ixgb*  at pci? # Intel Pro/10Gb ethernet
 ix*at pci? # Intel 82598EB 10Gb ethernet
+ixv*   at pci? # Virtual Function of Intel 82598EB
 myx*   at pci? # Myricom Myri-10G 10Gb ethernet
 oce*   at pci? # Emulex OneConnect 10Gb ethernet
 txp*   at pci? # 3com 3CR990
diff --git a/sys/dev/pci/files.pci b/sys/dev/pci/files.pci
index a803dc9b659..7d6402e524e 100644
--- a/sys/dev/pci/files.pci
+++ b/sys/dev/pci/files.pci
@@ -350,13 +350,19 @@ file  dev/pci/ixgb_hw.c   ixgb
 # Intel 82598 10GbE
 device ix: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach ix at pci
-file   dev/pci/if_ix.c ix
-file   dev/pci/ixgbe.c ix
-file   dev/pci/ixgbe_82598.c   ix
-file   dev/pci/ixgbe_82599.c   ix
-file   dev/pci/ixgbe_x540.cix
-file   dev/pci/ixgbe_x550.cix
-file   dev/pci/ixgbe_phy.c ix
+file   dev/pci/if_ix.c ix | ixv
+file   dev/pci/ixgbe.c ix | ixv
+file   dev/pci/ixgbe_82598.c   ix | ixv
+file   dev/pci/ixgbe_82599.c   ix | ixv
+file   dev/pci/ixgbe_x540.cix | ixv
+file   dev/pci/ixgbe_x550.cix | ixv
+file   dev/pci/ixgbe_phy.c ix | ixv
+
+# Virtual Function of i82599.
+device ixv: ether, ifnet, ifmedia, intrmap, stoeplitz
+attach ixv at pci
+file   dev/pci/if_ixv.cixv
+file   dev/pci/ixgbe_vf.c  ixv

 # Intel Ethernet 700 Series
 device ixl: ether, ifnet, ifmedia, intrmap, stoeplitz
diff --git a/sys/dev/pci/if_ix.c b/sys/dev/pci/if_ix.c
index b59ec28d9f1..4fb01d85778 100644
--- a/sys/dev/pci/if_ix.c
+++ b/sys/dev/pci/if_ix.c
@@ -507,7 +507,7 @@ ixgbe_start(struct ifqueue *ifq)
 * hardware that this frame is available to transmit.
 */
if (post)
-   IXGBE_WRITE_REG(>hw, IXGBE_TDT(txr->me),
+   IXGBE_WRITE_REG(>hw, txr->tail,
txr->next_avail_desc);
 }

@@ -706,7 +706,7 @@ ixgbe_watchdog(struct ifnet * ifp)
for (i = 0; i < sc->num_queues; i++, txr++) {
printf("%s: Queue(%d) tdh = %d, hw tdt = %d\n", ifp->if_xname, 
i,
IXGBE_READ_REG(hw, IXGBE_TDH(i)),
-   IXGBE_READ_REG(hw, IXGBE_TDT(i)));
+   IXGBE_READ_REG(hw, sc->tx_rings[i].tail));
printf("%s: TX(%d) Next TX to Clean = %d\n", ifp->if_xname,
i, txr->next_to_clean);
}
@@ -826,7 +826,7 @@ ixgbe_init(void *arg)
msec_delay(1);
}
IXGBE_WRITE_FLUSH(>hw);
-   IXGBE_WRITE_REG(>hw, IXGBE_RDT(i), rxr->last_desc_filled);
+   IXGBE_WRITE_REG(>hw, rxr[i].tail, rxr->last_desc_filled);
}

/* Set up VLAN support and filter */
@@ -2359,9 +2359,12 @@ ixgbe_initialize_transmit_units(struct ix_softc *sc)
IXGBE_WRITE_REG(hw, IXGBE_TDLEN(i),
sc->num_tx_desc * sizeof(struct ixgbe_legacy_tx_desc));

+   /* Set Tx Tail register */
+   txr->tail = IXGBE_TDT(i);
+
/* Setup the HW Tx Head and Tail descriptor pointers */
IXGBE_WRITE_REG(hw, IXGBE_TDH(i), 0);
-   IXGBE_WRITE_REG(hw, IXGBE_TDT(i), 0);
+   IXGBE_WRITE_REG(hw, txr->tail, 0);

/* Setup Transmit Descriptor Cmd Settings */
txr->txd_cmd = IXGBE_TXD_CMD_IFCS;
@@ -2834,7 +2837,7 @@ ixgbe_rxrefill(void *xrxr)

if (ixgbe_rxfill(rxr)) {
/* Advance the Rx Queue "Tail Pointer" */
-   IXGBE_WRITE_REG(>hw, IXGBE_RDT(rxr->me),
+   IXGBE_WRITE_REG(>hw, rxr->tail,
rxr->last_desc_filled);
} else if (if_rxr_inuse(>rx_ring) == 0)
timeout_add(>rx_refill, 1);
@@ -2928,6 +2931,9 @@ ixgbe_initialize_receive_units(struct ix_softc *sc)
srrctl = 

Re: ixv(4): porting Virtual Function driver for Intel 82599 series.

2022-12-04 Thread Yuichiro NAITO

I updated my patch to remove the AIM code. I can't see any performance improvement
with AIM enabled. I ran a packet forwarding test with the Cisco TRex Traffic
Generator (*1). AIM doesn't improve performance in this test.

*1: https://trex-tgn.cisco.com/

With my latest patch, I've got the following maximum packet forwarding rates in
the stateless benchmark (stl/bench.py) without packet loss. It starts to drop
packets above these rates.

Stateless benchmark (stl/bench.py)

| N cpu cores | mbps |
|-|--|
|  2  | 3390 |
|  4  | 4250 |
|  6  | 4340 |

Running without PF (Packet Filter) improves performance as follows.

| N cpu cores | mbps |
|-|--|
|  2  | 4740 |
|  4  | 7190 |
|  6  | 8410 |

All tests run in MTU 1500. Host hardware spec is shown as below.

Vendor: FUJITSU PRIMERGY RX2530 M2
CPU: 24 CPUs x Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz
Memory: 192 GB
NIC: Intel X540-AT2
ESXi version: 7.0U3

OK?

diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index 3563126b2a1..e421c38e95f 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -522,6 +522,7 @@ msk*at mskc?#  each port of 
above
 em*at pci? # Intel Pro/1000 ethernet
 ixgb*  at pci? # Intel Pro/10Gb ethernet
 ix*at pci? # Intel 82598EB 10Gb ethernet
+ixv*   at pci? # Virtual Function of Intel 82598EB
 myx*   at pci? # Myricom Myri-10G 10Gb ethernet
 oce*   at pci? # Emulex OneConnect 10Gb ethernet
 txp*   at pci? # 3com 3CR990
diff --git a/sys/dev/pci/files.pci b/sys/dev/pci/files.pci
index a803dc9b659..7d6402e524e 100644
--- a/sys/dev/pci/files.pci
+++ b/sys/dev/pci/files.pci
@@ -350,13 +350,19 @@ file  dev/pci/ixgb_hw.c   ixgb
 # Intel 82598 10GbE
 device ix: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach ix at pci
-file   dev/pci/if_ix.c ix
-file   dev/pci/ixgbe.c ix
-file   dev/pci/ixgbe_82598.c   ix
-file   dev/pci/ixgbe_82599.c   ix
-file   dev/pci/ixgbe_x540.cix
-file   dev/pci/ixgbe_x550.cix
-file   dev/pci/ixgbe_phy.c ix
+file   dev/pci/if_ix.c ix | ixv
+file   dev/pci/ixgbe.c ix | ixv
+file   dev/pci/ixgbe_82598.c   ix | ixv
+file   dev/pci/ixgbe_82599.c   ix | ixv
+file   dev/pci/ixgbe_x540.cix | ixv
+file   dev/pci/ixgbe_x550.cix | ixv
+file   dev/pci/ixgbe_phy.c ix | ixv
+
+# Virtual Function of i82599.
+device ixv: ether, ifnet, ifmedia, intrmap, stoeplitz
+attach ixv at pci
+file   dev/pci/if_ixv.cixv
+file   dev/pci/ixgbe_vf.c  ixv

 # Intel Ethernet 700 Series
 device ixl: ether, ifnet, ifmedia, intrmap, stoeplitz
diff --git a/sys/dev/pci/if_ix.c b/sys/dev/pci/if_ix.c
index b59ec28d9f1..4fb01d85778 100644
--- a/sys/dev/pci/if_ix.c
+++ b/sys/dev/pci/if_ix.c
@@ -507,7 +507,7 @@ ixgbe_start(struct ifqueue *ifq)
 * hardware that this frame is available to transmit.
 */
if (post)
-   IXGBE_WRITE_REG(>hw, IXGBE_TDT(txr->me),
+   IXGBE_WRITE_REG(>hw, txr->tail,
txr->next_avail_desc);
 }

@@ -706,7 +706,7 @@ ixgbe_watchdog(struct ifnet * ifp)
for (i = 0; i < sc->num_queues; i++, txr++) {
printf("%s: Queue(%d) tdh = %d, hw tdt = %d\n", ifp->if_xname, 
i,
IXGBE_READ_REG(hw, IXGBE_TDH(i)),
-   IXGBE_READ_REG(hw, IXGBE_TDT(i)));
+   IXGBE_READ_REG(hw, sc->tx_rings[i].tail));
printf("%s: TX(%d) Next TX to Clean = %d\n", ifp->if_xname,
i, txr->next_to_clean);
}
@@ -826,7 +826,7 @@ ixgbe_init(void *arg)
msec_delay(1);
}
IXGBE_WRITE_FLUSH(>hw);
-   IXGBE_WRITE_REG(>hw, IXGBE_RDT(i), rxr->last_desc_filled);
+   IXGBE_WRITE_REG(>hw, rxr[i].tail, rxr->last_desc_filled);
}

/* Set up VLAN support and filter */
@@ -2359,9 +2359,12 @@ ixgbe_initialize_transmit_units(struct ix_softc *sc)
IXGBE_WRITE_REG(hw, IXGBE_TDLEN(i),
sc->num_tx_desc * sizeof(struct ixgbe_legacy_tx_desc));

+   /* Set Tx Tail register */
+   txr->tail = IXGBE_TDT(i);
+
/* Setup the HW Tx Head and Tail descriptor pointers */
IXGBE_WRITE_REG(hw, IXGBE_TDH(i), 0);
-   IXGBE_WRITE_REG(hw, IXGBE_TDT(i), 0);
+   IXGBE_WRITE_REG(hw, txr->tail, 0);

/* Setup Transmit Descriptor Cmd Settings */
txr->txd_cmd = IXGBE_TXD_CMD_IFCS;
@@ -2834,7 +2837,7 @@ ixgbe_rxrefill(void *xrxr)

if (ixgbe_rxfill(rxr)) {
/* Advance the Rx 

Re: ixv(4): porting Virtual Function driver for Intel 82599 series.

2022-11-20 Thread Yuichiro NAITO

Thank you so much for reviewing my patch.

On 11/21/22 08:02, Christian Weisgerber wrote:

Yuichiro NAITO:


+static void
+ixv_set_multi(struct ix_softc *sc)
+{

[...]

+   if ((ifp->if_flags & IFF_PROMISC) == 0 && ac->ac_multirangecnt <= 0 &&
+ ac->ac_multicnt <= MAX_NUM_MULTICAST_ADDRESSES) {
+   ETHER_FIRST_MULTI(step, >arpcom, enm);
+   while (enm != NULL) {
+   bcopy(enm->enm_addrlo,
+ [mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS],
+ IXGBE_ETH_LENGTH_OF_ADDRESS);
+   mcnt++;
+
+   ETHER_NEXT_MULTI(step, enm);
+   }
+
+   update_ptr = mta;
+   sc->hw.mac.ops.update_mc_addr_list(>hw, update_ptr, mcnt,
+  ixv_mc_array_itr, TRUE);
+   }
+
+} /* ixv_set_multi */


This doesn't look right.
There is no handling of ac->ac_multirangecnt > 0 or
mcnt >= MAX_NUM_MULTICAST_ADDRESSES.

Compare ixgb_set_multi() in if_ixgb.c.


Yes, and also 'ixv_set_multi' works similarly to 'ixgbe_iff()' in 'if_ix.c'.
Once I found that the VF cannot write the FCTL register, I removed the
IXGBE_WRITE_REG macro and the 'fctrl' value. That was a mistake.

Writing the FCTL register should be replaced by calling the 'update_xcast_mode'
function to tell the PF which multicast addresses are used.

I updated 'ixv_set_multi' to catch the 'ac->ac_multirangecnt > 0 or
mcnt >= MAX_NUM_MULTICAST_ADDRESSES' case.


+int32_t ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, uint8_t 
*mc_addr_list,
+uint32_t mc_addr_count, ixgbe_mc_addr_itr 
next,
+bool clear)
+{

[...]

+   /* Each entry in the list uses 1 16 bit word.  We have 30
+* 16 bit words available in our HW msg buffer (minus 1 for the
+* msg type).  That's 30 hash values if we pack 'em right.  If
+* there are more than 30 MC addresses to add then punt the
+* extras for now and then add code to handle more than 30 later.
+* It would be unusual for a server to request that many multi-cast
+* addresses except for in large enterprise network environments.
+*/
+
+   DEBUGOUT1("MC Addr Count = %d\n", mc_addr_count);
+
+   cnt = (mc_addr_count > 30) ? 30 : mc_addr_count;


Should MAX_NUM_MULTICAST_ADDRESSES simply be set to 30?


MAX_NUM_MULTICAST_ADDRESSES is used for ix(4) driver.
So, I introduce IXGBE_MAX_MULTICAST_ADDRESSES_VF as 30.

Increasing this number will never work as intended, because multicast addresses
are sent to the PF (Physical Function) driver via the 'mailbox'.
The PF driver doesn't allow more than 30 multicast addresses for now.

For instance, if a Linux QEMU host provides ixgbe SR-IOV, the following Linux
driver code handles the 'update_mc_addr_list' request.

drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c:
 362static int ixgbe_set_vf_multicasts(struct ixgbe_adapter *adapter,
 363   u32 *msgbuf, u32 vf)
 364{
 365int entries = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK)
 366   >> IXGBE_VT_MSGINFO_SHIFT;
 367u16 *hash_list = (u16 *)[1];
 368struct vf_data_storage *vfinfo = >vfinfo[vf];
 369struct ixgbe_hw *hw = >hw;
 370int i;
 371u32 vector_bit;
 372u32 vector_reg;
 373u32 mta_reg;
 374u32 vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(vf));
 375
 376/* only so many hash values supported */
 377entries = min(entries, IXGBE_MAX_VF_MC_ENTRIES);

Number of multicast addresses is limited by IXGBE_MAX_VF_MC_ENTRIES(= 30).

I'm not sure about the ESXi host, because I don't have access to the ESXi source
code.
But I'm guessing it has a similar limitation.

And I've received another issue, about the FreeBSD VCS ID.
$FreeBSD$ has no meaning at all now.
I also removed it.

Here is my updated ixv patch.

diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index 3563126b2a1..e421c38e95f 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -522,6 +522,7 @@ msk*at mskc?#  each port of 
above
 em*at pci? # Intel Pro/1000 ethernet
 ixgb*  at pci? # Intel Pro/10Gb ethernet
 ix*at pci? # Intel 82598EB 10Gb ethernet
+ixv*   at pci? # Virtual Function of Intel 82598EB
 myx*   at pci? # Myricom Myri-10G 10Gb ethernet
 oce*   at pci? # Emulex OneConnect 10Gb ethernet
 txp*   at pci? # 3com 3CR990
diff --git a/sys/dev/pci/files.pci b/sys/dev/pci/files.pci
index a803dc9b659..7d6402e524e 100644
--- a/sys/dev/pci/files.pci
+++ b/sys/dev/pci/files.pci
@@ -350,13 +350,19 @@ file  dev/pci/ixgb_hw.c   ixgb
 # Intel 82598 10GbE
 device ix: ether, ifnet, ifmedia, intrmap, stoeplitz
 attach ix at pci
-file   

Re: ixv(4): porting Virtual Function driver for Intel 82599 series.

2022-11-20 Thread Christian Weisgerber
Yuichiro NAITO:

> +static void
> +ixv_set_multi(struct ix_softc *sc)
> +{
[...]
> + if ((ifp->if_flags & IFF_PROMISC) == 0 && ac->ac_multirangecnt <= 0 &&
> +   ac->ac_multicnt <= MAX_NUM_MULTICAST_ADDRESSES) {
> + ETHER_FIRST_MULTI(step, >arpcom, enm);
> + while (enm != NULL) {
> + bcopy(enm->enm_addrlo,
> +   [mcnt * IXGBE_ETH_LENGTH_OF_ADDRESS],
> +   IXGBE_ETH_LENGTH_OF_ADDRESS);
> + mcnt++;
> +
> + ETHER_NEXT_MULTI(step, enm);
> + }
> +
> + update_ptr = mta;
> + sc->hw.mac.ops.update_mc_addr_list(>hw, update_ptr, mcnt,
> +ixv_mc_array_itr, TRUE);
> + }
> +
> +} /* ixv_set_multi */

This doesn't look right.
There is no handling of ac->ac_multirangecnt > 0 or
mcnt >= MAX_NUM_MULTICAST_ADDRESSES.

Compare ixgb_set_multi() in if_ixgb.c.

> +int32_t ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, uint8_t 
> *mc_addr_list,
> +  uint32_t mc_addr_count, ixgbe_mc_addr_itr 
> next,
> +  bool clear)
> +{
[...]
> + /* Each entry in the list uses 1 16 bit word.  We have 30
> +  * 16 bit words available in our HW msg buffer (minus 1 for the
> +  * msg type).  That's 30 hash values if we pack 'em right.  If
> +  * there are more than 30 MC addresses to add then punt the
> +  * extras for now and then add code to handle more than 30 later.
> +  * It would be unusual for a server to request that many multi-cast
> +  * addresses except for in large enterprise network environments.
> +  */
> +
> + DEBUGOUT1("MC Addr Count = %d\n", mc_addr_count);
> +
> + cnt = (mc_addr_count > 30) ? 30 : mc_addr_count;

Should MAX_NUM_MULTICAST_ADDRESSES simply be set to 30?

-- 
Christian "naddy" Weisgerber  na...@mips.inka.de