[PATCH net] bnx2x: Improve reliability in case of nested PCI errors

2017-12-22 Thread Guilherme G. Piccoli
While in recovery process of PCI error (called EEH on PowerPC arch),
another PCI transaction could be corrupted causing a situation of
nested PCI errors. Also, this scenario could be reproduced with
error injection mechanisms (for debug purposes).

We observe that in case of nested PCI errors, bnx2x might attempt to
initialize its shmem and cause a kernel crash due to bad addresses
read from MCP. Multiple different stack traces were observed depending
on the point the second PCI error happens.

This patch avoids the crashes by:

 * failing PCI recovery in case of nested errors (since multiple
 PCI errors in a row are not expected to lead to a functional
 adapter anyway), and by,

 * preventing access to adapter FW when MCP is failed (we mark it as
 failed when shmem cannot get initialized properly).

Reported-by: Abdul Haleem <abdha...@linux.vnet.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 +-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 4c739d5355d2..8ae269ec17a1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3030,7 +3030,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, 
bool keep_link)
 
del_timer_sync(>timer);
 
-   if (IS_PF(bp)) {
+   if (IS_PF(bp) && !BP_NOMCP(bp)) {
/* Set ALWAYS_ALIVE bit in shmem */
bp->fw_drv_pulse_wr_seq |= DRV_PULSE_ALWAYS_ALIVE;
bnx2x_drv_pulse(bp);
@@ -3116,7 +3116,7 @@ int bnx2x_nic_unload(struct bnx2x *bp, int unload_mode, 
bool keep_link)
bp->cnic_loaded = false;
 
/* Clear driver version indication in shmem */
-   if (IS_PF(bp))
+   if (IS_PF(bp) && !BP_NOMCP(bp))
bnx2x_update_mng_version(bp);
 
/* Check if there are pending parity attentions. If there are - set
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 91e2a7560b48..ddd5d3ebd201 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -9578,6 +9578,15 @@ static int bnx2x_init_shmem(struct bnx2x *bp)
 
do {
bp->common.shmem_base = REG_RD(bp, MISC_REG_SHARED_MEM_ADDR);
+
+   /* If we read all 0xFFs, means we are in PCI error state and
+* should bail out to avoid crashes on adapter's FW reads.
+*/
+   if (bp->common.shmem_base == 0x) {
+   bp->flags |= NO_MCP_FLAG;
+   return -ENODEV;
+   }
+
if (bp->common.shmem_base) {
val = SHMEM_RD(bp, validity_map[BP_PORT(bp)]);
if (val & SHR_MEM_VALIDITY_MB)
@@ -14320,7 +14329,10 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct 
pci_dev *pdev)
BNX2X_ERR("IO slot reset --> driver unload\n");
 
/* MCP should have been reset; Need to wait for validity */
-   bnx2x_init_shmem(bp);
+   if (bnx2x_init_shmem(bp)) {
+   rtnl_unlock();
+   return PCI_ERS_RESULT_DISCONNECT;
+   }
 
if (IS_PF(bp) && SHMEM2_HAS(bp, drv_capabilities_flag)) {
u32 v;
-- 
2.15.0



[PATCH net] cxgb4: fix BUG() on interrupt deallocating path of ULD

2017-07-10 Thread Guilherme G. Piccoli
Since the introduction of ULD (Upper-Layer Drivers), the MSI-X
deallocating path changed in cxgb4: the driver frees the interrupts
of ULD when unregistering it or on shutdown PCI handler.

Problem is that if a MSI-X is not freed before deallocated in the PCI
layer, it will trigger a BUG() due to still "alive" interrupt being
tentatively quiesced.

The below trace was observed when doing a simple unbind of Chelsio's
adapter PCI function, like:
  "echo 001e:80:00.4 > /sys/bus/pci/drivers/cxgb4/unbind"

Trace:

  kernel BUG at drivers/pci/msi.c:352!
  Oops: Exception in kernel mode, sig: 5 [#1]
  ...
  NIP [c05a5e60] free_msi_irqs+0xa0/0x250
  LR [c05a5e50] free_msi_irqs+0x90/0x250
  Call Trace:
  [c05a5e50] free_msi_irqs+0x90/0x250 (unreliable)
  [c05a72c4] pci_disable_msix+0x124/0x180
  [d00011e06708] disable_msi+0x88/0xb0 [cxgb4]
  [d00011e06948] free_some_resources+0xa8/0x160 [cxgb4]
  [d00011e06d60] remove_one+0x170/0x3c0 [cxgb4]
  [c058a910] pci_device_remove+0x70/0x110
  [c064ef04] device_release_driver_internal+0x1f4/0x2c0
  ...

This patch fixes the issue by refactoring the shutdown path of ULD on
cxgb4 driver, by properly freeing and disabling interrupts on PCI
remove handler too.

Fixes: 0fbc81b3ad51 ("Allocate resources dynamically for all cxgb4 ULD's")
Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---

It might be important to add this patch to stable too if possible, v4.9+ .

 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 16 +++---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c  | 42 +++--
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 86f92e31e8aa..e403fa18f1b1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2083,12 +2083,12 @@ static void detach_ulds(struct adapter *adap)
 
mutex_lock(_mutex);
list_del(>list_node);
+
for (i = 0; i < CXGB4_ULD_MAX; i++)
-   if (adap->uld && adap->uld[i].handle) {
+   if (adap->uld && adap->uld[i].handle)
adap->uld[i].state_change(adap->uld[i].handle,
 CXGB4_STATE_DETACH);
-   adap->uld[i].handle = NULL;
-   }
+
if (netevent_registered && list_empty(_list)) {
unregister_netevent_notifier(_netevent_nb);
netevent_registered = false;
@@ -5303,8 +5303,10 @@ static void remove_one(struct pci_dev *pdev)
 */
destroy_workqueue(adapter->workq);
 
-   if (is_uld(adapter))
+   if (is_uld(adapter)) {
detach_ulds(adapter);
+   t4_uld_clean_up(adapter);
+   }
 
disable_interrupts(adapter);
 
@@ -5385,7 +5387,11 @@ static void shutdown_one(struct pci_dev *pdev)
if (adapter->port[i]->reg_state == NETREG_REGISTERED)
cxgb_close(adapter->port[i]);
 
-   t4_uld_clean_up(adapter);
+   if (is_uld(adapter)) {
+   detach_ulds(adapter);
+   t4_uld_clean_up(adapter);
+   }
+
disable_interrupts(adapter);
disable_msi(adapter);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index ec53fe9dec68..71a315bc1409 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -589,22 +589,37 @@ void t4_uld_mem_free(struct adapter *adap)
kfree(adap->uld);
 }
 
+/* This function should be called with uld_mutex taken. */
+static void cxgb4_shutdown_uld_adapter(struct adapter *adap, enum cxgb4_uld 
type)
+{
+   if (adap->uld[type].handle) {
+   adap->uld[type].handle = NULL;
+   adap->uld[type].add = NULL;
+   release_sge_txq_uld(adap, type);
+
+   if (adap->flags & FULL_INIT_DONE)
+   quiesce_rx_uld(adap, type);
+
+   if (adap->flags & USING_MSIX)
+   free_msix_queue_irqs_uld(adap, type);
+
+   free_sge_queues_uld(adap, type);
+   free_queues_uld(adap, type);
+   }
+}
+
 void t4_uld_clean_up(struct adapter *adap)
 {
unsigned int i;
 
-   if (!adap->uld)
-   return;
+   mutex_lock(_mutex);
for (i = 0; i < CXGB4_ULD_MAX; i++) {
if (!adap->uld[i].handle)
continue;
-   if (adap->flags & FULL_INIT

[PATCH net] cxgb4: avoid crash on PCI error recovery path

2017-05-28 Thread Guilherme G. Piccoli
During PCI error recovery process, specifically on eeh_err_detected()
we might have a NULL netdev struct, hence a direct dereference will
lead to a kernel oops. This was observed with latest upstream kernel
(v4.12-rc2) on Chelsio adapter T422-CR in PowerPC machines.

This patch checks for NULL pointer and avoids the crash, both in
eeh_err_detected() and eeh_resume(). Also, we avoid to trigger
a fatal error or to try disabling interrupts on FW during PCI
error recovery, because: (a) driver might not be able to accurately
access PCI regions in this case, and (b) trigger a fatal error
_during_ the recovery steps is a mistake that could prevent the
recovery path to complete successfully.

Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 21 +
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c  |  9 +++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 38a5c6764bb5..b512149684fd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2771,6 +2771,9 @@ void t4_fatal_err(struct adapter *adap)
 {
int port;
 
+   if (pci_channel_offline(adap->pdev))
+   return;
+
/* Disable the SGE since ULDs are going to free resources that
 * could be exposed to the adapter.  RDMA MWs for example...
 */
@@ -3882,9 +3885,10 @@ static pci_ers_result_t eeh_err_detected(struct pci_dev 
*pdev,
spin_lock(>stats_lock);
for_each_port(adap, i) {
struct net_device *dev = adap->port[i];
-
-   netif_device_detach(dev);
-   netif_carrier_off(dev);
+   if (dev) {
+   netif_device_detach(dev);
+   netif_carrier_off(dev);
+   }
}
spin_unlock(>stats_lock);
disable_interrupts(adap);
@@ -3963,12 +3967,13 @@ static void eeh_resume(struct pci_dev *pdev)
rtnl_lock();
for_each_port(adap, i) {
struct net_device *dev = adap->port[i];
-
-   if (netif_running(dev)) {
-   link_start(dev);
-   cxgb_set_rxmode(dev);
+   if (dev) {
+   if (netif_running(dev)) {
+   link_start(dev);
+   cxgb_set_rxmode(dev);
+   }
+   netif_device_attach(dev);
}
-   netif_device_attach(dev);
}
rtnl_unlock();
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index aded42b96f6d..3a34aa629f7d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -4557,8 +4557,13 @@ void t4_intr_enable(struct adapter *adapter)
  */
 void t4_intr_disable(struct adapter *adapter)
 {
-   u32 whoami = t4_read_reg(adapter, PL_WHOAMI_A);
-   u32 pf = CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5 ?
+   u32 whoami, pf;
+
+   if (pci_channel_offline(adapter->pdev))
+   return;
+
+   whoami = t4_read_reg(adapter, PL_WHOAMI_A);
+   pf = CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5 ?
SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami);
 
t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE_A), 0);
-- 
2.12.0.rc0



Re: i40e: driver can't probe device (capabilities discovery error)

2017-02-08 Thread Guilherme G. Piccoli
I forgot to attach a perhaps important file, the output of lspci.
Here it goes.

Thanks,


Guilherme
0002:01:00.0 Ethernet controller [0200]: Intel Corporation Ethernet Controller 
X710/X557-AT 10GBASE-T [8086:1589] (rev 02)
Subsystem: Super Micro Computer Inc Device [15d9:]
Control: I/O- Mem+ BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr+ 
Stepping- SERR+ FastB2B- DisINTx-
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- TAbort- SERR- TAbort- SERR- TAbort- SERR- 

i40e: driver can't probe device (capabilities discovery error)

2017-02-08 Thread Guilherme G. Piccoli
Recently we had a sudden fail on Intel XL710 adapter, in which the i40e
driver is not able to probe the device anymore - it fails right on the
beginning of the probe process, on discovery capabilities procedure. We
observed the following messages on kernel (v4.10-rc7) log:


i40e: Intel(R) Ethernet Connection XL710 Network Driver - version 1.6.25-k
i40e: Copyright (c) 2013 - 2014 Intel Corporation.
i40e 0002:01:00.0: Using 64-bit DMA iommu bypass
i40e 0002:01:00.0: fw 5.1.40981 api 1.5 nvm 5.03 0x80002469 1.1313.0
i40e 0002:01:00.0: capability discovery failed, err OK aq_err
I40E_AQ_RC_EMODE
i40e 0002:01:00.1: Using 64-bit DMA iommu bypass
i40e 0002:01:00.1: fw 5.1.40981 api 1.5 nvm 5.03 0x80002469 1.1313.0
i40e 0002:01:00.1: capability discovery failed, err OK aq_err
I40E_AQ_RC_EMODE




We were able to "revive" the adapter using one of the following 2
procedures:

i) PowerPC systems have a feature called EEH, that is a PCI slot reset
in essence. It's something in HW/PHB level, so the mechanism does a slot
reset, that can be a PCI Hot Reset or Fundamental Reset (PERST).

The 1st way to recover the adapter was to inject an error on this slot
and forcing a called "hotplug recovery". Basically, we removed the
adapter from the PCI core (echo 1 >
/sys/bus/pci/devices/0002:01:00.*/remove), then we froze the PHB
transactions (using a debug facility on powerpc kernel) and then we do a
rescan on PCI bus (echo 1 > /sys/bus/pci/rescan).

This led to Hot Reset on slot, and adapter recovered fine, i40e driver
was able to complete the probe procedure. I can provide full logs if
desired.
Although I think this is too hacky way...

ii) With the attached patch, we were able to "partially" circumvent the
issue. Basically, the probe procedure worked fine to all device
functions, but on function 3 we failed in eeprom check - the following
messages were observed in the kernel log:

[29.1126] i40e 0002:01:00.3: Using 64-bit DMA iommu bypass
[32.3530] i40e 0002:01:00.3: fw 5.1.40981 api 1.5 nvm 5.03 0x24695003
192.0.63
[32.8441] i40e 0002:01:00.3: eeprom check failed (-2), Tx/Rx traffic
disabled
[32.8583] i40e 0002:01:00.3: MAC address: 0c:c4:7a:89:f1:c3
[32.8712] i40e 0002:01:00.3: MSI-X vector limit reached, attempting to
redistribute vectors
[32.9765] i40e 0002:01:00.3: Added LAN device PF3 bus=0x00 func=0x03
[32.9766] i40e 0002:01:00.3: PCI-Express: Speed 8.0GT/s Width x8
[32.9867] i40e 0002:01:00.3: Features: PF-id[3] VFs: 32 VSIs: 34 QP: 119
RSS FD_ATR FD_SB NTUPLE DCB VxLAN Geneve PTP VEPA


All the other 3 functions presented the same messages except the eeprom
check failed.
I'm aware the patch needs some rework (in my understanding, the logic
works only to a single adapter, because we need a global reset only in
one function of the adapter. But the patch logic fails if we have more
than 1 physical adapter on machine. It's just a draft/RFC version for now).
--

So, I'd like to request help/feedback from you regarding what's going
on. I'm not sure the root cause of the sudden adapter failure. In one
day it was fine, and in the other, after a machine reboot, it entered in
this odd state. We have 2 machines presenting this behavior and 5 others
that are fine.

Is there a way to clear this bad state on the adapter, like a special
reset (or even a jumper that we should play physically)? I tried EMP
reset too, but seems it's not allowed for some reason (perhaps only in
NVM update mode? Not sure). Also, any pointers on how to understand the
root cause are welcome.
Thanks in advance,


Guilherme
>From 1a49e453816dbab747788b87f9d03edc978cb50b Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpicc...@linux.vnet.ibm.com>
Date: Tue, 7 Feb 2017 17:38:04 -0200
Subject: [PATCH] i40: force global reset on adapter probe

Device might experience a bad state on probe time, making impossible
to the function i40e_probe() to successfully complete.

In these cases, for example we observed the following message in
kernel log:

  [22.6397] i40e 0002:01:00.0: capability discovery failed, err OK aq_err I40E_AQ_RC_EMODE

This patch forces a global reset to happen on driver probe to avoid
the issue.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad4cf63..f686c4a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10928,7 +10928,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	static u16 pfs_found;
 	u16 wol_nvm_bits;
 	u16 link_status;
-	int err;
+	int err, globr_probe = 1;
 	u32 val;
 	u32 i;
 	u8 set_fc_aq_fail;
@@ -11009,6 +11009,11 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (debug < -1)
 		pf->hw.debug

Re: [PATCH net v2] igb: re-assign hw address pointer on reset after PCI error

2016-12-02 Thread Guilherme G. Piccoli
On 11/10/2016 04:46 PM, Guilherme G. Piccoli wrote:
> Whenever the igb driver detects the result of a read operation returns
> a value composed only by F's (like 0x), it will detach the
> net_device, clear the hw_addr pointer and warn to the user that adapter's
> link is lost - those steps happen on igb_rd32().
> 
> In case a PCI error happens on Power architecture, there's a recovery
> mechanism called EEH, that will reset the PCI slot and call driver's
> handlers to reset the adapter and network functionality as well.
> 
> We observed that once hw_addr is NULL after the error is detected on
> igb_rd32(), it's never assigned back, so in the process of resetting
> the network functionality we got a NULL pointer dereference in both
> igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid
> such bug, this patch re-assigns the hw_addr value in the slot_reset
> handler.
> 
> Reported-by: Anthony H. Thai <aht...@us.ibm.com>
> Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
> Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/intel/igb/igb_main.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
> b/drivers/net/ethernet/intel/igb/igb_main.c
> index edc9a6a..136ee9e 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -7878,6 +7878,11 @@ static pci_ers_result_t igb_io_slot_reset(struct 
> pci_dev *pdev)
>   pci_enable_wake(pdev, PCI_D3hot, 0);
>   pci_enable_wake(pdev, PCI_D3cold, 0);
> 
> + /* In case of PCI error, adapter lose its HW address
> +  * so we should re-assign it here.
> +  */
> + hw->hw_addr = adapter->io_addr;
> +
>   igb_reset(adapter);
>   wr32(E1000_WUS, ~0);
>   result = PCI_ERS_RESULT_RECOVERED;
> 

Ping?

Sorry to annoy, any news on this?
Thanks in advance!

Cheers,


Guilherme



[PATCH net v2] igb: re-assign hw address pointer on reset after PCI error

2016-11-10 Thread Guilherme G. Piccoli
Whenever the igb driver detects the result of a read operation returns
a value composed only by F's (like 0x), it will detach the
net_device, clear the hw_addr pointer and warn to the user that adapter's
link is lost - those steps happen on igb_rd32().

In case a PCI error happens on Power architecture, there's a recovery
mechanism called EEH, that will reset the PCI slot and call driver's
handlers to reset the adapter and network functionality as well.

We observed that once hw_addr is NULL after the error is detected on
igb_rd32(), it's never assigned back, so in the process of resetting
the network functionality we got a NULL pointer dereference in both
igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid
such bug, this patch re-assigns the hw_addr value in the slot_reset
handler.

Reported-by: Anthony H. Thai <aht...@us.ibm.com>
Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index edc9a6a..136ee9e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -7878,6 +7878,11 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev 
*pdev)
pci_enable_wake(pdev, PCI_D3hot, 0);
pci_enable_wake(pdev, PCI_D3cold, 0);
 
+   /* In case of PCI error, adapter lose its HW address
+* so we should re-assign it here.
+*/
+   hw->hw_addr = adapter->io_addr;
+
igb_reset(adapter);
wr32(E1000_WUS, ~0);
result = PCI_ERS_RESULT_RECOVERED;
-- 
2.1.0



Re: [Intel-wired-lan] [PATCH net] igb: re-assign hw address pointer on reset after PCI error

2016-11-10 Thread Guilherme G. Piccoli
On 11/09/2016 03:12 PM, Alexander Duyck wrote:
> On Mon, Oct 31, 2016 at 1:12 PM, Guilherme G. Piccoli
> <gpicc...@linux.vnet.ibm.com> wrote:
>> Whenever the igb driver detects the result of a read operation returns
>> a value composed only by F's (like 0x), it will detach the
>> net_device, clear the hw_addr pointer and warn to the user that adapter's
>> link is lost - those steps happen on igb_rd32().
>>
>> In case a PCI error happens on Power architecture, there's a recovery
>> mechanism called EEH, that will reset the PCI slot and call driver's
>> handlers to reset the adapter and network functionality as well.
>>
>> We observed that once hw_addr is NULL after the error is detected on
>> igb_rd32(), it's never assigned back, so in the process of resetting
>> the network functionality we got a NULL pointer dereference in both
>> igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid
>> such bug, we re-assign the hw_addr value in the beginning of the
>> function igb_reset(), in case the hw_addr is NULL when we reach that
>> path.
>>
>> Reported-by: Anthony H. Thai <aht...@us.ibm.com>
>> Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
>> Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
>> ---
>>  drivers/net/ethernet/intel/igb/igb_main.c | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
>> b/drivers/net/ethernet/intel/igb/igb_main.c
>> index edc9a6a..c19119c 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_main.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
>> @@ -1873,6 +1873,13 @@ void igb_reset(struct igb_adapter *adapter)
>> struct e1000_fc_info *fc = >fc;
>> u32 pba, hwm;
>>
>> +   /* In case of PCI error, adapter might have lost its HW
>> +* address; if we reached this point after an error scenario,
>> +* we should re-assign the hw_addr based on the saved io_addr.
>> +*/
>> +   if (!hw->hw_addr)
>> +   hw->hw_addr = adapter->io_addr;
>> +
>> /* Repartition Pba for greater than 9k mtu
>>  * To take effect CTRL.RST is required.
>>  */
> 
> It seems like this would have the potential to get noisy pretty
> quickly since every reset would retrigger this.
> 
> It might make more sense to move this line into igb_io_slot_reset and
> igb_resume where the device would have gone through a reset of some
> sort and then had the state of the device restored and the device
> memory access was re-enabled via pci_enable_device_mem.  Also there is
> no point in really doing the "if" since you should always be okay to
> overwrite hw->hw_addr with adapter->io_addr.

Thanks for the good suggestion Alexander, will send a V2.
Cheers,

Guilherme


> 
> Thanks.
> 
> - Alex
> 



[PATCH net] ehea: fix operation state report

2016-11-03 Thread Guilherme G. Piccoli
Currently the ehea driver is missing a call to netif_carrier_off()
before the interface bring-up; this is necessary in order to
initialize the __LINK_STATE_NOCARRIER bit in the net_device state
field. Otherwise, we observe state UNKNOWN on "ip address" command
output.

This patch adds a call to netif_carrier_off() on ehea's net device
open callback.

Reported-by: Xiong Zhou <z...@redhat.com>
Reference-ID: IBM bz #137702, Red Hat bz #1089134
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c 
b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 54efa9a..bd719e2 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -2446,6 +2446,8 @@ static int ehea_open(struct net_device *dev)
 
netif_info(port, ifup, dev, "enabling port\n");
 
+   netif_carrier_off(dev);
+
ret = ehea_up(dev);
if (!ret) {
port_napi_enable(port);
-- 
2.1.0



[PATCH net] igb: re-assign hw address pointer on reset after PCI error

2016-10-31 Thread Guilherme G. Piccoli
Whenever the igb driver detects the result of a read operation returns
a value composed only by F's (like 0x), it will detach the
net_device, clear the hw_addr pointer and warn to the user that adapter's
link is lost - those steps happen on igb_rd32().

In case a PCI error happens on Power architecture, there's a recovery
mechanism called EEH, that will reset the PCI slot and call driver's
handlers to reset the adapter and network functionality as well.

We observed that once hw_addr is NULL after the error is detected on
igb_rd32(), it's never assigned back, so in the process of resetting
the network functionality we got a NULL pointer dereference in both
igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid
such bug, we re-assign the hw_addr value in the beginning of the
function igb_reset(), in case the hw_addr is NULL when we reach that
path.

Reported-by: Anthony H. Thai <aht...@us.ibm.com>
Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index edc9a6a..c19119c 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1873,6 +1873,13 @@ void igb_reset(struct igb_adapter *adapter)
struct e1000_fc_info *fc = >fc;
u32 pba, hwm;
 
+   /* In case of PCI error, adapter might have lost its HW
+* address; if we reached this point after an error scenario,
+* we should re-assign the hw_addr based on the saved io_addr.
+*/
+   if (!hw->hw_addr)
+   hw->hw_addr = adapter->io_addr;
+
/* Repartition Pba for greater than 9k mtu
 * To take effect CTRL.RST is required.
 */
-- 
2.1.0



[PATCH net v2] tg3: Avoid NULL pointer dereference in tg3_io_error_detected()

2016-09-29 Thread Guilherme G. Piccoli
From: Milton Miller <milt...@us.ibm.com>

While the driver is probing the adapter, an error may occur before the
netdev structure is allocated and attached to pci_dev. In this case,
not only netdev isn't available, but the tg3 private structure is also
not available as it is just math from the NULL pointer, so dereferences
must be skipped.

The following trace is seen when the error is triggered:

  [1.402247] Unable to handle kernel paging request for data at address 
0x1a99
  [1.402410] Faulting instruction address: 0xc07e33f8
  [1.402450] Oops: Kernel access of bad area, sig: 11 [#1]
  [1.402481] SMP NR_CPUS=2048 NUMA PowerNV
  [1.402513] Modules linked in:
  [1.402545] CPU: 0 PID: 651 Comm: eehd Not tainted 4.4.0-36-generic #55-Ubuntu
  [1.402591] task: c01fe4e42a20 ti: c01fe4e88000 task.ti: 
c01fe4e88000
  [1.402742] NIP: c07e33f8 LR: c07e3164 CTR: c0595ea0
  [1.402787] REGS: c01fe4e8b790 TRAP: 0300   Not tainted  (4.4.0-36-generic)
  [1.402832] MSR: 90019033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28000422  
XER: 2000
  [1.403058] CFAR: c0008468 DAR: 1a99 DSISR: 4200 
SOFTE: 1
  GPR00: c07e3164 c01fe4e8ba10 c15c5e00 
  GPR04: 0001  0039 0299
  GPR08:  0001 c01fe4e88000 0006
  GPR12:  cfb4 c00e6558 c03ca1bffd00
  GPR16:    
  GPR20:    c0d52768
  GPR24: c0d52740 0100 c03ca1b52000 0002
  GPR28: 0900  c152a0c0 c03ca1b52000
  [1.404226] NIP [c07e33f8] tg3_io_error_detected+0x308/0x340
  [1.404265] LR [c07e3164] tg3_io_error_detected+0x74/0x340

This patch avoids the NULL pointer dereference by moving the access after
the netdev NULL pointer check on tg3_io_error_detected(). Also, we add a
check for netdev being NULL on tg3_io_resume() [suggested by Michael Chan].

Fixes: 0486a063b1ff ("tg3: prevent ifup/ifdown during PCI error recovery")
Fixes: dfc8f370316b ("net/tg3: Release IRQs on permanent error")
Tested-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
Signed-off-by: Milton Miller <milt...@us.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---

  * v2 changelog: added netdev NULL check on tg3_io_resume() as per
Michael Chan's suggestion.

 drivers/net/ethernet/broadcom/tg3.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index a2551bc..ea967df 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -18122,14 +18122,14 @@ static pci_ers_result_t tg3_io_error_detected(struct 
pci_dev *pdev,
 
rtnl_lock();
 
-   /* We needn't recover from permanent error */
-   if (state == pci_channel_io_frozen)
-   tp->pcierr_recovery = true;
-
/* We probably don't have netdev yet */
if (!netdev || !netif_running(netdev))
goto done;
 
+   /* We needn't recover from permanent error */
+   if (state == pci_channel_io_frozen)
+   tp->pcierr_recovery = true;
+
tg3_phy_stop(tp);
 
tg3_netif_stop(tp);
@@ -18226,7 +18226,7 @@ static void tg3_io_resume(struct pci_dev *pdev)
 
rtnl_lock();
 
-   if (!netif_running(netdev))
+   if (!netdev || !netif_running(netdev))
goto done;
 
tg3_full_lock(tp, 0);
-- 
2.1.0



Re: [PATCH RFC net-next] bnx2x: avoid printing unnecessary messages during register dump

2016-09-29 Thread Guilherme G. Piccoli


On 09/27/2016 11:43 PM, David Miller wrote:
> From: "Guilherme G. Piccoli" <gpicc...@linux.vnet.ibm.com>
> Date: Tue, 27 Sep 2016 15:33:54 -0300
> 
>> The bnx2x driver prints multiple error messages during register dump,
>> with "ethtool -d" for example. The driver even warn that many messages
>> might be seen during the register dump, but they are harmless. A typical
>> kernel log after register dump looks like this:
>>
>>   [9.375] bnx2x: [bnx2x_get_regs:987(net0)]Generating register dump. Might 
>> trigger harmless GRC timeouts
>>   [9.439] bnx2x: [bnx2x_attn_int_deasserted3:4342(net0)]LATCHED attention 
>> 0x0400 (masked)
>>   [9.439] bnx2x: [bnx2x_attn_int_deasserted3:4346(net0)]GRC time-out 
>> 0x010580cd
>>   [...]
>>
>> The notation [...] means that some messages were supressed - in our
>> tests we saw 78 more "LATCHED attention" and "GRC time-out" messages,
>> supressed here.
>>
>> This patch avoid these messages to be printed on register dump instead
>> of just warn they are harmless.
>>
>> Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
> 
> Although "ethtool -d" is really a debugging facility, I still think that
> serious care should be placed into arranging what gets dumped in such
> a way that such access timeouts and errors are minimized.
> 

David, thanks for your comment. I confess I didn't understand your
statement quite well. You say we shouldn't dump registers that will
cause timeouts, that's it?

If yes, I guess this is a valid point. We will however loose some debug
information (as you mentioned, 'ethtool -d' is a debug facility). Now,
since I'm no expert in QLogic adapter hw/fw, I want to ask Yuval/Ariel
why those timeouts are hit anyway. Are they completely harmless?

In my understanding/opinion, hiding the messages entirely (as this patch
does) OR avoid the timeouts by disabling some registers' dump are both
better alternatives than the current behavior of the driver.

Thanks,



Guilherme



Re: [Intel-wired-lan] [PATCH net] i40e: avoid NULL pointer dereference and recursive errors on early PCI error

2016-09-27 Thread Guilherme G. Piccoli
On 09/27/2016 07:41 PM, Keller, Jacob E wrote:
> On Tue, 2016-09-27 at 18:14 -0300, Guilherme G. Piccoli wrote:
>> Although rare, it's possible to hit PCI error early on device
>> probe, meaning possibly some structs are not entirely initialized,
>> and some might even be completely uninitialized, leading to NULL
>> pointer dereference.
>>
>> The i40e driver currently presents a "bad" behavior if device hits
>> such early PCI error: firstly, the struct i40e_pf might not be
>> attached to pci_dev yet, leading to a NULL pointer dereference on
>> access to pf->state.
>>
> 
> Oops! Nice find!
> 
>> Even checking if the struct is NULL and avoiding the access in that
>> case isn't enough, since the driver cannot recover from PCI error
>> that early; in our experiments we saw multiple failures on kernel
>> log, like:
>>
>>   [549.664] i40e 0007:01:00.1: Initial pf_reset failed: -15
>>   [549.664] i40e: probe of 0007:01:00.1 failed with error -15
>>   [...]
>>   [871.644] i40e 0007:01:00.1: The driver for the device stopped
>> because the
>>   device firmware failed to init. Try updating your NVM image.
>>   [871.644] i40e: probe of 0007:01:00.1 failed with error -32
>>   [...]
>>   [872.516] i40e 0007:01:00.0: ARQ: Unknown event 0x ignored
>>
>> Between the first probe failure (error -15) and the second (error
>> -32)
>> another PCI error happened due to the first bad probe. Also, driver
>> started to flood console with those ARQ event messages.
>>
>> This patch will prevent these issues by allowing error recovery
>> mechanism to remove the failed device from the system instead of
>> trying to recover from early PCI errors during device probe.
>>
> 
> This seems reasonable.
> 
>> Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c
>> b/drivers/net/ethernet/intel/i40e/i40e_main.c
>> index d0b3a1b..dad15b6 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
>> @@ -11360,6 +11360,12 @@ static pci_ers_result_t
>> i40e_pci_error_detected(struct pci_dev *pdev,
>>  
>>  dev_info(>dev, "%s: error %d\n", __func__, error);
>>  
>> +if (!pf) {
>> +dev_info(>dev,
>> + "Cannot recover - error happened during
>> device probe\n");
>> +return PCI_ERS_RESULT_DISCONNECT;
>> +}
>> +
> 
> Looks good to me.
> 
> Acked-by: Jacob Keller <jacob.e.kel...@intel.com>
> 
> Thanks for the bug fix and detailed explanation!

Nice Jacob, thanks very much for the review and ack.

Cheers,



Guilherme


> Regards,
> Jake
> 
>>  /* shutdown all operations */
>>  if (!test_bit(__I40E_SUSPENDED, >state)) {
>>  rtnl_lock();



Re: [PATCH net] tg3: Avoid NULL pointer dereference in tg3_io_error_detected()

2016-09-27 Thread Guilherme G. Piccoli

On 09/27/2016 05:58 PM, Michael Chan wrote:

On Tue, Sep 27, 2016 at 1:05 PM, Guilherme G. Piccoli
<gpicc...@linux.vnet.ibm.com> wrote:

From: Milton Miller <milt...@us.ibm.com>

While the driver is probing the adapter, an error may occur before the
netdev structure is allocated and attached to pci_dev. In this case,
not only netdev isn't available, but the tg3 private structure is also
not available as it is just math from the NULL pointer, so dereferences
must be skipped.

The following trace is seen when the error is triggered:

   [1.402247] Unable to handle kernel paging request for data at address 
0x1a99
   [1.402410] Faulting instruction address: 0xc07e33f8
   [1.402450] Oops: Kernel access of bad area, sig: 11 [#1]
   [1.402481] SMP NR_CPUS=2048 NUMA PowerNV
   [1.402513] Modules linked in:
   [1.402545] CPU: 0 PID: 651 Comm: eehd Not tainted 4.4.0-36-generic #55-Ubuntu
   [1.402591] task: c01fe4e42a20 ti: c01fe4e88000 task.ti: 
c01fe4e88000
   [1.402742] NIP: c07e33f8 LR: c07e3164 CTR: c0595ea0
   [1.402787] REGS: c01fe4e8b790 TRAP: 0300   Not tainted  
(4.4.0-36-generic)
   [1.402832] MSR: 90019033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28000422  
XER: 2000
   [1.403058] CFAR: c0008468 DAR: 1a99 DSISR: 4200 
SOFTE: 1
   GPR00: c07e3164 c01fe4e8ba10 c15c5e00 
   GPR04: 0001  0039 0299
   GPR08:  0001 c01fe4e88000 0006
   GPR12:  cfb4 c00e6558 c03ca1bffd00
   GPR16:    
   GPR20:    c0d52768
   GPR24: c0d52740 0100 c03ca1b52000 0002
   GPR28: 0900  c152a0c0 c03ca1b52000
   [1.404226] NIP [c07e33f8] tg3_io_error_detected+0x308/0x340
   [1.404265] LR [c07e3164] tg3_io_error_detected+0x74/0x340

This patch avoids the NULL pointer dereference by moving the access after
the netdev NULL pointer check on tg3_io_error_detected().

Fixes: 0486a063b1ff ("tg3: prevent ifup/ifdown during PCI error recovery")
Fixes: dfc8f370316b ("net/tg3: Release IRQs on permanent error")
Tested-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
Signed-off-by: Milton Miller <milt...@us.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>


Looks good.  Do we need to add !netdev check in tg3_io_resume()?


Thanks Michael. It's a good point - I didn't trigger any error without 
the check, but looking at error handlers, every one seems to have this 
check except tg3_io_resume().


Do you want us to send a v2 including this check? Or maybe another patch?

Cheers,



Guilherme



[PATCH net] i40e: avoid NULL pointer dereference and recursive errors on early PCI error

2016-09-27 Thread Guilherme G. Piccoli
Although rare, it's possible to hit PCI error early on device
probe, meaning possibly some structs are not entirely initialized,
and some might even be completely uninitialized, leading to NULL
pointer dereference.

The i40e driver currently presents a "bad" behavior if device hits
such early PCI error: firstly, the struct i40e_pf might not be
attached to pci_dev yet, leading to a NULL pointer dereference on
access to pf->state.

Even checking if the struct is NULL and avoiding the access in that
case isn't enough, since the driver cannot recover from PCI error
that early; in our experiments we saw multiple failures on kernel
log, like:

  [549.664] i40e 0007:01:00.1: Initial pf_reset failed: -15
  [549.664] i40e: probe of 0007:01:00.1 failed with error -15
  [...]
  [871.644] i40e 0007:01:00.1: The driver for the device stopped because the
  device firmware failed to init. Try updating your NVM image.
  [871.644] i40e: probe of 0007:01:00.1 failed with error -32
  [...]
  [872.516] i40e 0007:01:00.0: ARQ: Unknown event 0x ignored

Between the first probe failure (error -15) and the second (error -32)
another PCI error happened due to the first bad probe. Also, driver
started to flood console with those ARQ event messages.

This patch will prevent these issues by allowing error recovery
mechanism to remove the failed device from the system instead of
trying to recover from early PCI errors during device probe.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index d0b3a1b..dad15b6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11360,6 +11360,12 @@ static pci_ers_result_t i40e_pci_error_detected(struct 
pci_dev *pdev,
 
dev_info(>dev, "%s: error %d\n", __func__, error);
 
+   if (!pf) {
+   dev_info(>dev,
+"Cannot recover - error happened during device 
probe\n");
+   return PCI_ERS_RESULT_DISCONNECT;
+   }
+
/* shutdown all operations */
if (!test_bit(__I40E_SUSPENDED, >state)) {
rtnl_lock();
-- 
2.1.0



[PATCH net] tg3: Avoid NULL pointer dereference in tg3_io_error_detected()

2016-09-27 Thread Guilherme G. Piccoli
From: Milton Miller <milt...@us.ibm.com>

While the driver is probing the adapter, an error may occur before the
netdev structure is allocated and attached to pci_dev. In this case,
not only netdev isn't available, but the tg3 private structure is also
not available as it is just math from the NULL pointer, so dereferences
must be skipped.

The following trace is seen when the error is triggered:

  [1.402247] Unable to handle kernel paging request for data at address 
0x1a99
  [1.402410] Faulting instruction address: 0xc07e33f8
  [1.402450] Oops: Kernel access of bad area, sig: 11 [#1]
  [1.402481] SMP NR_CPUS=2048 NUMA PowerNV
  [1.402513] Modules linked in:
  [1.402545] CPU: 0 PID: 651 Comm: eehd Not tainted 4.4.0-36-generic #55-Ubuntu
  [1.402591] task: c01fe4e42a20 ti: c01fe4e88000 task.ti: 
c01fe4e88000
  [1.402742] NIP: c07e33f8 LR: c07e3164 CTR: c0595ea0
  [1.402787] REGS: c01fe4e8b790 TRAP: 0300   Not tainted  (4.4.0-36-generic)
  [1.402832] MSR: 90019033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28000422  
XER: 2000
  [1.403058] CFAR: c0008468 DAR: 1a99 DSISR: 4200 
SOFTE: 1
  GPR00: c07e3164 c01fe4e8ba10 c15c5e00 
  GPR04: 0001  0039 0299
  GPR08:  0001 c01fe4e88000 0006
  GPR12:  cfb4 c00e6558 c03ca1bffd00
  GPR16:    
  GPR20:    c0d52768
  GPR24: c0d52740 0100 c03ca1b52000 0002
  GPR28: 0900  c152a0c0 c03ca1b52000
  [1.404226] NIP [c07e33f8] tg3_io_error_detected+0x308/0x340
  [1.404265] LR [c07e3164] tg3_io_error_detected+0x74/0x340

This patch avoids the NULL pointer dereference by moving the access after
the netdev NULL pointer check on tg3_io_error_detected().

Fixes: 0486a063b1ff ("tg3: prevent ifup/ifdown during PCI error recovery")
Fixes: dfc8f370316b ("net/tg3: Release IRQs on permanent error")
Tested-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
Signed-off-by: Milton Miller <milt...@us.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/tg3.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index a2551bc..3a5fce7 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -18122,14 +18122,14 @@ static pci_ers_result_t tg3_io_error_detected(struct 
pci_dev *pdev,
 
rtnl_lock();
 
-   /* We needn't recover from permanent error */
-   if (state == pci_channel_io_frozen)
-   tp->pcierr_recovery = true;
-
/* We probably don't have netdev yet */
if (!netdev || !netif_running(netdev))
goto done;
 
+   /* We needn't recover from permanent error */
+   if (state == pci_channel_io_frozen)
+   tp->pcierr_recovery = true;
+
tg3_phy_stop(tp);
 
tg3_netif_stop(tp);
-- 
2.1.0



[PATCH RFC net-next] bnx2x: avoid printing unnecessary messages during register dump

2016-09-27 Thread Guilherme G. Piccoli
The bnx2x driver prints multiple error messages during register dump,
with "ethtool -d" for example. The driver even warn that many messages
might be seen during the register dump, but they are harmless. A typical
kernel log after register dump looks like this:

  [9.375] bnx2x: [bnx2x_get_regs:987(net0)]Generating register dump. Might 
trigger harmless GRC timeouts
  [9.439] bnx2x: [bnx2x_attn_int_deasserted3:4342(net0)]LATCHED attention 
0x0400 (masked)
  [9.439] bnx2x: [bnx2x_attn_int_deasserted3:4346(net0)]GRC time-out 0x010580cd
  [...]

The notation [...] means that some messages were supressed - in our
tests we saw 78 more "LATCHED attention" and "GRC time-out" messages,
supressed here.

This patch avoid these messages to be printed on register dump instead
of just warn they are harmless.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---

  * This was sent as RFC for two main reasons: firstly, I might be ignoring
some importance in showing these error messages during register dump.
Also, there are multiple ways to implement this idea - I just did the
first one that came to my head. We might also add a new flag on struct
bnx2x or even a new field. Any suggestions regarding the best
implementation are welcome.

 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h|  1 +
 .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c| 17 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   | 22 --
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 7dd7490..73f2713 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -2053,6 +2053,7 @@ void bnx2x_update_coalesce(struct bnx2x *bp);
 int bnx2x_get_cur_phy_idx(struct bnx2x *bp);
 
 bool bnx2x_port_after_undi(struct bnx2x *bp);
+bool bnx2x_is_reading_regs(void);
 
 static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
   int wait)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 85a7800..d7dc867 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -29,6 +29,8 @@
 #include "bnx2x_dump.h"
 #include "bnx2x_init.h"
 
+static int bnx2x_reading_regs;
+
 /* Note: in the format strings below %s is replaced by the queue-name which is
  * either its index or 'fcoe' for the fcoe queue. Make sure the format string
  * length does not exceed ETH_GSTRING_LEN - MAX_QUEUE_NAME_LEN + 2
@@ -981,19 +983,24 @@ static void bnx2x_get_regs(struct net_device *dev,
memcpy(p, _hdr, sizeof(struct dump_header));
p += dump_hdr.header_size + 1;
 
-   /* This isn't really an error, but since attention handling is going
-* to print the GRC timeouts using this macro, we use the same.
+   /* Actually read the registers - we use bnx2x_reading_regs to
+* avoid multiple unnecessary error messages to be printed on
+* kernel log when reading registers, like GRC timeouts.
 */
-   BNX2X_ERR("Generating register dump. Might trigger harmless GRC 
timeouts\n");
-
-   /* Actually read the registers */
+   bnx2x_reading_regs = 1;
__bnx2x_get_regs(bp, p);
+   bnx2x_reading_regs = 0;
 
/* Re-enable parity attentions */
bnx2x_clear_blocks_parity(bp);
bnx2x_enable_blocks_parity(bp);
 }
 
+inline bool bnx2x_is_reading_regs(void)
+{
+   return !!bnx2x_reading_regs;
+}
+
 static int bnx2x_get_preset_regs_len(struct net_device *dev, u32 preset)
 {
struct bnx2x *bp = netdev_priv(dev);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index fa3386b..392d14c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -4339,16 +4339,18 @@ static void bnx2x_attn_int_deasserted3(struct bnx2x 
*bp, u32 attn)
}
 
if (attn & EVEREST_LATCHED_ATTN_IN_USE_MASK) {
-   BNX2X_ERR("LATCHED attention 0x%08x (masked)\n", attn);
-   if (attn & BNX2X_GRC_TIMEOUT) {
-   val = CHIP_IS_E1(bp) ? 0 :
-   REG_RD(bp, MISC_REG_GRC_TIMEOUT_ATTN);
-   BNX2X_ERR("GRC time-out 0x%08x\n", val);
-   }
-   if (attn & BNX2X_GRC_RSV) {
-   val = CHIP_IS_E1(bp) ? 0 :
-   REG_RD(bp, MISC_REG_GRC_RSV_ATTN);
-   BNX2X_ERR("GRC reserved 0x%08x\n", val);
+   if (!bnx2x_is_reading_regs()) {
+   BNX2X_ERR("LATCHED attention 0x%08

[PATCH net] i40e: disable MSI-X interrupts if we cannot reserve enough vectors

2016-09-22 Thread Guilherme G. Piccoli
If we fail on allocating enough MSI-X interrupts, we should disable
them since they were previously enabled in this point of code.

Not disabling them can lead to WARN_ON() being triggered and subsequent
failure in enabling MSI as a fallback; the below message was shown without
this patch while we played with interrupt allocation in i40e driver:

[ 21.461346] sysfs: cannot create duplicate filename 
'/devices/pci0007:00/0007:00:00.0/0007:01:00.3/msi_irqs'
[ 21.461459] [ cut here ]
[ 21.461514] WARNING: CPU: 64 PID: 1155 at fs/sysfs/dir.c:31 
sysfs_warn_dup+0x88/0xc0

Also, we noticed that without this patch, if we modprobe the module without
enough MSI-X interrupts (triggering the above warning), unload the module
and re-load it again, we got a crash on the system.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index d0b3a1b..f8ebe78 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7721,6 +7721,7 @@ static int i40e_init_msix(struct i40e_pf *pf)
pf->flags &= ~I40E_FLAG_MSIX_ENABLED;
kfree(pf->msix_entries);
pf->msix_entries = NULL;
+   pci_disable_msix(pf->pdev);
return -ENODEV;
 
} else if (v_actual == I40E_MIN_MSIX) {
-- 
2.1.0



[PATCH net v2] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-31 Thread Guilherme G. Piccoli
When PCI error is detected, in some architectures (like PowerPC) a slot
reset is performed - the driver's error handlers are in charge of "disable"
device before the reset, and re-enable it after a successful slot reset.

There are two cases though that another path is taken on the code: if the
slot reset is not successful or if too many errors already happened in the
specific adapter (meaning that possibly the device is experiencing a HW
failure that slot reset is not able to solve), the core PCI error mechanism
(called EEH in PowerPC) will remove the adapter from the system, since it
will consider this as a permanent failure on device. In this case, a path
is taken that leads to bnx2x_chip_cleanup() calling bnx2x_reset_hw(), which
then tries to perform a HW reset on chip. This reset won't succeed since
the HW is in a fault state, which can be seen by multiple messages on
kernel log like below:

bnx2x: [bnx2x_issue_dmae_with_comp:552(eth1)]DMAE timeout!
bnx2x: [bnx2x_write_dmae:600(eth1)]DMAE returned failure -1

After some time, the PCI error mechanism gives up on waiting the driver's
correct removal procedure and forcibly remove the adapter from the system.
We can see soft lockup while core PCI error mechanism is waiting for driver
to accomplish the right removal process.

This patch adds a verification to avoid a chip reset whenever the function
is in PCI error state - since this case is only reached when we have a
device being removed because of a permanent failure, the HW chip reset is
not expected to work fine neither is necessary.

Also, as a minor improvement in error path, we avoid the MCP information dump
in case of non-recoverable PCI error (when adapter is about to be removed),
since it will certainly fail.

Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
v2: Removed the unlikely attribute on bnx2x_fw_dump_lvl() if block.

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 97e8925..fa3386b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -772,6 +772,11 @@ void bnx2x_fw_dump_lvl(struct bnx2x *bp, const char *lvl)
(bp->common.bc_ver & 0xff00) >> 8,
(bp->common.bc_ver & 0xff));
 
+   if (pci_channel_offline(bp->pdev)) {
+   BNX2X_ERR("Cannot dump MCP info while in PCI error\n");
+   return;
+   }
+
val = REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER);
if (val == REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER))
BNX2X_ERR("%s" "MCP PC at 0x%x\n", lvl, val);
@@ -9415,10 +9420,16 @@ unload_error:
/* Release IRQs */
bnx2x_free_irq(bp);
 
-   /* Reset the chip */
-   rc = bnx2x_reset_hw(bp, reset_code);
-   if (rc)
-   BNX2X_ERR("HW_RESET failed\n");
+   /* Reset the chip, unless PCI function is offline. If we reach this
+* point following a PCI error handling, it means device is really
+* in a bad state and we're about to remove it, so reset the chip
+* is not a good idea.
+*/
+   if (!pci_channel_offline(bp->pdev)) {
+   rc = bnx2x_reset_hw(bp, reset_code);
+   if (rc)
+   BNX2X_ERR("HW_RESET failed\n");
+   }
 
/* Report UNLOAD_DONE to MCP */
bnx2x_send_unload_done(bp, keep_link);
-- 
2.1.0



Re: [PATCH net] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-16 Thread Guilherme G. Piccoli

On 08/10/2016 10:28 AM, Guilherme G. Piccoli wrote:

On 08/10/2016 04:59 AM, Yuval Mintz wrote:

Why would the published resume()  from pci_error_handlers be called
in this

scenario?

It isn't. That's why I specifically commented on commit message:
"There are two
cases though that another path is taken on the code".

The code path reach bnx2x_chip_cleanup() on device removal from the
system,
as seen in the below call trace:

bnx2x_chip_cleanup+0x3c0/0x910 [bnx2x]
bnx2x_nic_unload+0x268/0xaf0 [bnx2x]
bnx2x_close+0x34/0x50 [bnx2x]
__dev_close_many+0xd4/0x150
dev_close_many+0xa8/0x160
rollback_registered_many+0x174/0x3f0
rollback_registered+0x40/0x70
unregister_netdevice_queue+0x98/0x110
unregister_netdev+0x34/0x50
__bnx2x_remove+0xa8/0x3a0 [bnx2x]
pci_device_remove+0x70/0x110


Makes sense.


Also, we avoid the MCP information dump in case of non-recoverable
PCI error (when adapter is about to be removed), since it will
certainly fail.


We should probably avoid several things here; Why specifically only
this?


For example, we shouldn't execute bnx2x_timer() in this scenario. But
I thought
it'd be too much to check every call of a timer function against PCI
channel state
just to avoid it's execution on this scenario, so I just let it
execute, since it seems
harmless.


+/* Reset the chip, unless PCI function is offline. If we reach
this
+ * point following a PCI error handling, it means device is
really
+ * in a bad state and we're about to remove it, so reset the chip
+ * is not a good idea.
+ */
+if (!pci_channel_offline(bp->pdev)) {
+rc = bnx2x_reset_hw(bp, reset_code);
+if (rc)
+BNX2X_ERR("HW_RESET failed\n");
+}


Why not simply check this at the beginning of the function?


Because I wasn't sure if I could drop the entire execution of
chip_cleanup(). I
tried to keep the most of this function aiming to shutdown the module
in a
gentle way, like cleaning MAC, stopping queues...but again, I'm open to
suggestions and gladly will change this in v2 if you think it's for
the best.


Problem is I won't be able to have a more thorough review of this in
the next
couple of days - and other than code-review I won't have a reasonable way
of testing this [I can use aer_inject, but I don't have your magical EEH
error injections, and I'm not at all certain it would suffice for a
good testing ].

I agree that even as-is, what you're suggesting is an improvement to the
existing flow - so it's basically up to dave, i.e., whether to take a
half fix
or wait for a more thorough one.



Thanks for your consideration. The point is: the important part of this
patch is avoiding the reset_hw() path, since it will clearly fail and
generate soft lockups. This is the fix per se, the other part (regarding
the MCP dump) is just an improvement; surely we have more potential
improvements to explore, but they wouldn't be fixes, only code
improvements.

So, I wouldn't call this a half fix, but yet, a completely fix with a
small improvement as a bonus :)

Cheers,


Guilherme



David, sorry to bother you - maybe you didn't notice this.
Any comments?

Thanks in advance,


Guilherme



Re: [PATCH net] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-10 Thread Guilherme G. Piccoli

On 08/10/2016 04:59 AM, Yuval Mintz wrote:

Why would the published resume()  from pci_error_handlers be called in this

scenario?

It isn't. That's why I specifically commented on commit message: "There are two
cases though that another path is taken on the code".

The code path reach bnx2x_chip_cleanup() on device removal from the system,
as seen in the below call trace:

bnx2x_chip_cleanup+0x3c0/0x910 [bnx2x]
bnx2x_nic_unload+0x268/0xaf0 [bnx2x]
bnx2x_close+0x34/0x50 [bnx2x]
__dev_close_many+0xd4/0x150
dev_close_many+0xa8/0x160
rollback_registered_many+0x174/0x3f0
rollback_registered+0x40/0x70
unregister_netdevice_queue+0x98/0x110
unregister_netdev+0x34/0x50
__bnx2x_remove+0xa8/0x3a0 [bnx2x]
pci_device_remove+0x70/0x110


Makes sense.


Also, we avoid the MCP information dump in case of non-recoverable
PCI error (when adapter is about to be removed), since it will certainly fail.


We should probably avoid several things here; Why specifically only this?


For example, we shouldn't execute bnx2x_timer() in this scenario. But I thought
it'd be too much to check every call of a timer function against PCI channel 
state
just to avoid it's execution on this scenario, so I just let it execute, since 
it seems
harmless.


+   /* Reset the chip, unless PCI function is offline. If we reach this
+* point following a PCI error handling, it means device is really
+* in a bad state and we're about to remove it, so reset the chip
+* is not a good idea.
+*/
+   if (!pci_channel_offline(bp->pdev)) {
+   rc = bnx2x_reset_hw(bp, reset_code);
+   if (rc)
+   BNX2X_ERR("HW_RESET failed\n");
+   }


Why not simply check this at the beginning of the function?


Because I wasn't sure if I could drop the entire execution of chip_cleanup(). I
tried to keep the most of this function aiming to shutdown the module in a
gentle way, like cleaning MAC, stopping queues...but again, I'm open to
suggestions and gladly will change this in v2 if you think it's for the best.


Problem is I won't be able to have a more thorough review of this in the next
couple of days - and other than code-review I won't have a reasonable way
of testing this [I can use aer_inject, but I don't have your magical EEH
error injections, and I'm not at all certain it would suffice for a good 
testing ].

I agree that even as-is, what you're suggesting is an improvement to the
existing flow - so it's basically up to dave, i.e., whether to take a half fix
or wait for a more thorough one.



Thanks for your consideration. The point is: the important part of this 
patch is avoiding the reset_hw() path, since it will clearly fail and 
generate soft lockups. This is the fix per se, the other part (regarding 
the MCP dump) is just an improvement; surely we have more potential 
improvements to explore, but they wouldn't be fixes, only code improvements.


So, I wouldn't call this a half fix, but yet, a completely fix with a 
small improvement as a bonus :)


Cheers,


Guilherme



Re: [PATCH net] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-09 Thread Guilherme G. Piccoli

On 08/09/2016 09:14 AM, Yuval Mintz wrote:

When PCI error is detected, in some architectures (like PowerPC) a slot reset is
performed - the driver's error handlers are in charge of "disable"
device before the reset, and re-enable it after a successful slot reset.

There are two cases though that another path is taken on the code: if the slot
reset is not successful or if too many errors already happened in the specific
adapter (meaning that possibly the device is experiencing a HW failure that slot
reset is not able to solve), the core PCI error mechanism (called EEH in 
PowerPC)
will remove the adapter from the system, since it will consider this as a
permanent failure on device.

Why would the published resume()  from pci_error_handlers be called in this 
scenario?


It isn't. That's why I specifically commented on commit message: "There 
are two cases though that another path is taken on the code".


The code path reach bnx2x_chip_cleanup() on device removal from the 
system, as seen in the below call trace:


bnx2x_chip_cleanup+0x3c0/0x910 [bnx2x]
bnx2x_nic_unload+0x268/0xaf0 [bnx2x]
bnx2x_close+0x34/0x50 [bnx2x]
__dev_close_many+0xd4/0x150
dev_close_many+0xa8/0x160
rollback_registered_many+0x174/0x3f0
rollback_registered+0x40/0x70
unregister_netdevice_queue+0x98/0x110
unregister_netdev+0x34/0x50
__bnx2x_remove+0xa8/0x3a0 [bnx2x]
pci_device_remove+0x70/0x110





Also, we avoid the MCP information dump in case of non-recoverable PCI error
(when adapter is about to be removed), since it will certainly fail.


We should probably avoid several things here; Why specifically only this?


For example, we shouldn't execute bnx2x_timer() in this scenario. But I 
thought it'd be too much to check every call of a timer function against 
PCI channel state just to avoid it's execution on this scenario, so I 
just let it execute, since it seems harmless.


On the other hand, MCP dump is, as you said below, a slowpath and surely 
will fail in the following conditional (since addresses are all like 
0xFFF...):


   /* sanity */
if (trace_shmem_base < MCPR_SCRATCH_BASE(bp) + 
MCPR_TRACE_BUFFER_SIZE ||

trace_shmem_base >= MCPR_SCRATCH_BASE(bp) +
SCRATCH_BUFFER_SIZE(bp))

So, I added the check to at least be more informative on logs on why it 
failed. If you desire, we can avoid the timer execution (maybe a 
del_timer() instead of checking for PCI state?) and bnx2x_period_task() 
run in case of PCI permanent failure. I'm open to suggestions!




+   if (unlikely(pci_channel_offline(bp->pdev))) {
+   BNX2X_ERR("Cannot dump MCP info while in PCI error\n");
+   return;
+   }
+

Nitpicky, but I don't think there's reason to add 'unlikely' to a purely 
slowpath
Configuration.


OK, your call. I can remove it on v2, no problem.





val = REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER);
if (val == REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER))
BNX2X_ERR("%s" "MCP PC at 0x%x\n", lvl, val); @@ -9415,10
+9420,16 @@ unload_error:




-   /* Reset the chip */
-   rc = bnx2x_reset_hw(bp, reset_code);
-   if (rc)
-   BNX2X_ERR("HW_RESET failed\n");
+   /* Reset the chip, unless PCI function is offline. If we reach this
+* point following a PCI error handling, it means device is really
+* in a bad state and we're about to remove it, so reset the chip
+* is not a good idea.
+*/
+   if (!pci_channel_offline(bp->pdev)) {
+   rc = bnx2x_reset_hw(bp, reset_code);
+   if (rc)
+   BNX2X_ERR("HW_RESET failed\n");
+   }


Why not simply check this at the beginning of the function?


Because I wasn't sure if I could drop the entire execution of 
chip_cleanup(). I tried to keep the most of this function aiming to 
shutdown the module in a gentle way, like cleaning MAC, stopping 
queues...but again, I'm open to suggestions and gladly will change this 
in v2 if you think it's for the best.


Thanks for your review,


Guilherme



[PATCH net] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-08 Thread Guilherme G. Piccoli
When PCI error is detected, in some architectures (like PowerPC) a slot
reset is performed - the driver's error handlers are in charge of "disable"
device before the reset, and re-enable it after a successful slot reset.

There are two cases though that another path is taken on the code: if the
slot reset is not successful or if too many errors already happened in the
specific adapter (meaning that possibly the device is experiencing a HW
failure that slot reset is not able to solve), the core PCI error mechanism
(called EEH in PowerPC) will remove the adapter from the system, since it
will consider this as a permanent failure on device. In this case, a path
is taken that leads to bnx2x_chip_cleanup() calling bnx2x_reset_hw(), which
then tries to perform a HW reset on chip. This reset won't succeed since
the HW is in a fault state, which can be seen by multiple messages on
kernel log like below:

bnx2x: [bnx2x_issue_dmae_with_comp:552(eth1)]DMAE timeout!
bnx2x: [bnx2x_write_dmae:600(eth1)]DMAE returned failure -1

After some time, the PCI error mechanism gives up on waiting the driver's
correct removal procedure and forcibly remove the adapter from the system.
We can see soft lockup while core PCI error mechanism is waiting for driver
to accomplish the right removal process.

This patch adds a verification to avoid a chip reset whenever the function
is in PCI error state - since this case is only reached when we have a
device being removed because of a permanent failure, the HW chip reset is
not expected to work fine neither is necessary.

Also, we avoid the MCP information dump in case of non-recoverable PCI
error (when adapter is about to be removed), since it will certainly fail.

Reported-by: Harsha Thyagaraja <hathy...@in.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 97e8925..e6329dc 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -772,6 +772,11 @@ void bnx2x_fw_dump_lvl(struct bnx2x *bp, const char *lvl)
(bp->common.bc_ver & 0xff00) >> 8,
(bp->common.bc_ver & 0xff));
 
+   if (unlikely(pci_channel_offline(bp->pdev))) {
+   BNX2X_ERR("Cannot dump MCP info while in PCI error\n");
+   return;
+   }
+
val = REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER);
if (val == REG_RD(bp, MCP_REG_MCPR_CPU_PROGRAM_COUNTER))
BNX2X_ERR("%s" "MCP PC at 0x%x\n", lvl, val);
@@ -9415,10 +9420,16 @@ unload_error:
/* Release IRQs */
bnx2x_free_irq(bp);
 
-   /* Reset the chip */
-   rc = bnx2x_reset_hw(bp, reset_code);
-   if (rc)
-   BNX2X_ERR("HW_RESET failed\n");
+   /* Reset the chip, unless PCI function is offline. If we reach this
+* point following a PCI error handling, it means device is really
+* in a bad state and we're about to remove it, so reset the chip
+* is not a good idea.
+*/
+   if (!pci_channel_offline(bp->pdev)) {
+   rc = bnx2x_reset_hw(bp, reset_code);
+   if (rc)
+   BNX2X_ERR("HW_RESET failed\n");
+   }
 
/* Report UNLOAD_DONE to MCP */
bnx2x_send_unload_done(bp, keep_link);
-- 
2.1.0



[PATCH net] be2net: perform temperature query in adapter regardless of its interface state

2016-07-26 Thread Guilherme G. Piccoli
The be2net driver performs fw temperature queries on be_worker() routine,
which is executed each second for each be_adapter. There is a frequency
threshold to avoid fw query to happens at each call to be_worker();
instead, currently a fw query occurs once in 64 runs of the procedure.

Nevertheless, this fw temperature query is invoked only for adapters which
interface is up, so we can see I/O errors on read of hwmon counters from
userspace (from tools like lm-sensors) in case we have adapters' functions
which interface is down.

This patch moves the fw query code to be invoked even if interface is down.
No functional changes were introduced.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index ed98ef1..706fdfa 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4996,6 +4996,10 @@ static void be_worker(struct work_struct *work)
struct be_rx_obj *rxo;
int i;
 
+   if (be_physfn(adapter) &&
+   MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
+   be_cmd_get_die_temperature(adapter);
+
/* when interrupts are not yet enabled, just reap any pending
 * mcc completions
 */
@@ -5014,10 +5018,6 @@ static void be_worker(struct work_struct *work)
be_cmd_get_stats(adapter, >stats_cmd);
}
 
-   if (be_physfn(adapter) &&
-   MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
-   be_cmd_get_die_temperature(adapter);
-
for_all_rx_queues(adapter, rxo, i) {
/* Replenish RX-queues starved due to memory
 * allocation failures.
-- 
2.1.0



Re: [PATCH net-next 1/2] be2net: set temperature value for all adapter's functions

2016-07-26 Thread Guilherme G. Piccoli

On 07/26/2016 05:26 AM, Sathya Perla wrote:

-Original Message-
From: Guilherme G. Piccoli [mailto:gpicc...@linux.vnet.ibm.com]

On 07/25/2016 07:48 AM, Sathya Perla wrote:

-Original Message-
From: Guilherme G. Piccoli [mailto:gpicc...@linux.vnet.ibm.com]

Temperature values on be2net driver are made available to userspace
via

hwmon abstraction, so tools like lm-

sensors can present them to the user.
The driver provides hwmon structures for each adapter's function.
Nevertheless, the temperature information come from fw queries
performed

by

be_worker() with some frequency, and this procedure is called with a

single function as argument; this means

that the temperature value is updated only in the specific function
that

was passed to be_worker().


This can lead to incongruency in reported temperature by a function,
or

in a worse scenario, some functions

might be unable to provide temperature info to userspace, if they

weren't fed with this information from fw in

be_worker() run.


Hi, I'm wondering if you are OK with the temperature value being 128s
old
(2/2 patch), then why is it a problem
if two different functions report a temperature value that is queried
a few seconds apart?
Also, you'll not have a scenario where the FW cmd succeeds for one
function and fails for other functions.
It's a common FW for the entire adapter.



This patch changes the way temperature is set in be2net driver. At

anytime the fw query is performed, it will set

the temperature value for all functions of the adapter, instead of
only

setting the temperature of the function

passed to be_worker().

If the possible inconsistency across functions is indeed a problem,
then a simpler solution would be to issue the FW cmd synchronously
when the sysfs attr is read, i.e., in
be_hwmon_show_temp() routine itself.



Hi Sathya, thanks very much for your quick reply. I agree with you that an
1 or 2 sec inconsistency wouldn't
harm, but the main problem we're seeing is that be_worker() is being
called with a single function as a parameter
- in our case, the last function is being passed as argument to
be_worker() multiple times in a row, and then we
have its temperature updated but the other functions' temperature set as
invalid.


Hi Guilherme, this doesn't sound right to me and is not expected. The
be_worker() routine must execute for *each* function every second.
Can you pls share the driver/fw version and any debug logs (with prints) you
may have and also lspci output.


Hi Sathya, indeed...this is _not right_...from my side heheh
Unfortunately I made a mistake in my analysis and ended up 
over-engineering a "solution" to an issue which root cause wasn't clear 
to me! I want to thank you for your relevant questions and the 
information you provided, which helped a lot to figure exactly what's 
going on.


Our issue is seen because some adapter's functions (3 out of 4) have 
their interface down, and the fw temperature queries are performed only 
for functions which interface is up. The following conditional avoids fw 
query to occur whenever adapter's interface is down:


  if (!netif_running(adapter->netdev))
[be_main.c:5002, kernel v4.7]

It seems harmless to change the fw query location to perform temperature 
read for all functions regardless the state of its interface - this will 
solve our issue. I wrote a simple patch (to "net", and not "net-next" 
anymore) to improve this driver's behavior.

I'll send it right after this message, please let me know what you think.

Again, thanks very much for your attention and sorry for my confusion.
Cheers,


Guilherme




Regarding the temperature update run on be_hwmon_show_temp(), it was an
idea too, but I was afraid in delay
this output too much - imagine some userspace tool reads hwmon attributes
for all functions almost at "same
time", supposing the fw command can't run in parallel, the "last" read
would need to wait 4 fw commands to
complete before showing it's output.


I don't see any issue even if the sensors program queries each function one
after another. These calls would only be
a few milli-seconds apart.


Besides, in a worse scenario, some "not-friendly" tool might issue lots of
reads to hwmon per second then
issuing lots of fw commands, which does not seem a good idea. Of course
this last case we can avoid by
implementing a counter or timer on be_hwmon_show_temp() to allow maximum
number of fw cmds in a time
frame.

Yes, this is not an issue. If the hwmon read is issued with-in a few seconds
of the previous read then you can just return the old temperature value.
We are anyway querying this value only once in 64s now.
But, I'd like to root-cause the issue you are seeing above before we "fix"
anything.

thanks,
-Sathya





Re: [PATCH net-next 1/2] be2net: set temperature value for all adapter's functions

2016-07-25 Thread Guilherme G. Piccoli

On 07/25/2016 07:48 AM, Sathya Perla wrote:

-Original Message-
From: Guilherme G. Piccoli [mailto:gpicc...@linux.vnet.ibm.com]

Temperature values on be2net driver are made available to userspace via

hwmon abstraction, so tools like lm-

sensors can present them to the user.
The driver provides hwmon structures for each adapter's function.
Nevertheless, the temperature information come from fw queries performed

by

be_worker() with some frequency, and this procedure is called with a

single function as argument; this means

that the temperature value is updated only in the specific function that

was passed to be_worker().


This can lead to incongruency in reported temperature by a function, or

in a worse scenario, some functions

might be unable to provide temperature info to userspace, if they

weren't fed with this information from fw in

be_worker() run.


Hi, I'm wondering if you are OK with the temperature value being 128s old
(2/2 patch), then why is it a problem
if two different functions report a temperature value that is queried a
few seconds apart?
Also, you'll not have a scenario where the FW cmd succeeds for one
function and fails for other functions.
It's a common FW for the entire adapter.



This patch changes the way temperature is set in be2net driver. At

anytime the fw query is performed, it will set

the temperature value for all functions of the adapter, instead of only

setting the temperature of the function

passed to be_worker().

If the possible inconsistency across functions is indeed a problem, then a
simpler solution would be to
issue the FW cmd synchronously when the sysfs attr is read, i.e., in
be_hwmon_show_temp() routine itself.



Hi Sathya, thanks very much for your quick reply. I agree with you that 
an 1 or 2 sec inconsistency wouldn't harm, but the main problem we're 
seeing is that be_worker() is being called with a single function as a 
parameter - in our case, the last function is being passed as argument 
to be_worker() multiple times in a row, and then we have its temperature 
updated but the other functions' temperature set as invalid.


Regarding the temperature update run on be_hwmon_show_temp(), it was an 
idea too, but I was afraid in delay this output too much - imagine some 
userspace tool reads hwmon attributes for all functions almost at "same 
time", supposing the fw command can't run in parallel, the "last" read 
would need to wait 4 fw commands to complete before showing it's output.
Besides, in a worse scenario, some "not-friendly" tool might issue lots 
of reads to hwmon per second then issuing lots of fw commands, which 
does not seem a good idea. Of course this last case we can avoid by 
implementing a counter or timer on be_hwmon_show_temp() to allow maximum 
number of fw cmds in a time frame.


I appreciate your advice on how do you prefer to address this issue.
Thanks,


Guilherme



thanks!
-Sathya





[PATCH net-next 2/2] be2net: query temperature on probe and decrease its frequency on be_worker()

2016-07-22 Thread Guilherme G. Piccoli
Currently the be2net driver queries the temperature from fw in regular
intervals on be_worker(), which is a delayed work procedure that
reschedules itself to run in next second. The interval for temperature
query is currently set to 64, meaning at each 64 seconds the query will
happen on be_worker().

This patch adds a temperature fw query on be_probe() and increase the
current query timeout of 64 to 128, lowering the frequency of the
temperature fw queries in be_worker() by half. In our experiments, the
probing time got increased by 8.7% (from 11.62s to 12.63s) for a
4-function Lancer adapter (PCI Vid:Did == 10df:e220).

This aims to solve the problem of userspace temperature request right after
be2net probing shows inconsistent values because be_worker() didn't perform
the first temperature query yet. Besides, this patch aims to reduce the
overhead of querying fw each minute for temperature information.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index 9f44a00..0e23ab2 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -5232,7 +5232,7 @@ static int be_drv_init(struct be_adapter *adapter)
adapter->tx_fc = true;
 
/* Must be a power of 2 or else MODULO will BUG_ON */
-   adapter->be_get_temp_freq = 64;
+   adapter->be_get_temp_freq = 128;
 
return 0;
 
@@ -5445,6 +5445,7 @@ static int be_probe(struct pci_dev *pdev, const struct 
pci_device_id *pdev_id)
   adapter,
   be_hwmon_groups);
adapter->hwmon_info.be_on_die_temp = BE_INVALID_DIE_TEMP;
+   be_cmd_get_die_temperature(adapter);
}
 
dev_info(>dev, "%s: %s %s port %c\n", nic_name(pdev),
-- 
2.1.0



[PATCH net-next 1/2] be2net: set temperature value for all adapter's functions

2016-07-22 Thread Guilherme G. Piccoli
Temperature values on be2net driver are made available to userspace via
hwmon abstraction, so tools like lm-sensors can present them to the user.
The driver provides hwmon structures for each adapter's function.
Nevertheless, the temperature information come from fw queries performed by
be_worker() with some frequency, and this procedure is called with a single
function as argument; this means that the temperature value is updated only
in the specific function that was passed to be_worker().

This can lead to incongruency in reported temperature by a function, or in
a worse scenario, some functions might be unable to provide temperature
info to userspace, if they weren't fed with this information from fw in
be_worker() run.

This patch changes the way temperature is set in be2net driver. At anytime
the fw query is performed, it will set the temperature value for all
functions of the adapter, instead of only setting the temperature of the
function passed to be_worker().

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/emulex/benet/be.h  |  1 +
 drivers/net/ethernet/emulex/benet/be_cmds.c | 13 +++---
 drivers/net/ethernet/emulex/benet/be_main.c | 63 +
 3 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h 
b/drivers/net/ethernet/emulex/benet/be.h
index fe3763d..76f8fdb 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -851,6 +851,7 @@ u32 be_get_fw_log_level(struct be_adapter *adapter);
 int be_update_queues(struct be_adapter *adapter);
 int be_poll(struct napi_struct *napi, int budget);
 void be_eqd_update(struct be_adapter *adapter, bool force_update);
+void be_set_adapters_temperature_value(struct be_adapter *adapter, u8 temp);
 
 /*
  * internal function to initialize-cleanup roce device.
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c 
b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 22402db..6c59351 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -221,13 +221,12 @@ static void be_async_cmd_process(struct be_adapter 
*adapter,
if (base_status == MCC_STATUS_SUCCESS) {
struct be_cmd_resp_get_cntl_addnl_attribs *resp =
(void *)resp_hdr;
-   adapter->hwmon_info.be_on_die_temp =
-   resp->on_die_temperature;
-   } else {
-   adapter->be_get_temp_freq = 0;
-   adapter->hwmon_info.be_on_die_temp =
-   BE_INVALID_DIE_TEMP;
-   }
+   be_set_adapters_temperature_value(adapter,
+ 
resp->on_die_temperature);
+   } else
+   be_set_adapters_temperature_value(adapter,
+ BE_INVALID_DIE_TEMP);
+
return;
}
 }
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index ed98ef1..9f44a00 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -53,6 +53,15 @@ static const struct pci_device_id be_dev_ids[] = {
{ 0 }
 };
 MODULE_DEVICE_TABLE(pci, be_dev_ids);
+
+struct adapter_list_node {
+   struct be_adapter *adapter;
+   struct list_head node;
+};
+
+static LIST_HEAD(adapters_list);
+static DEFINE_MUTEX(adapters_list_lock);
+
 /* UE Status Low CSR */
 static const char * const ue_status_low_desc[] = {
"CEV",
@@ -130,6 +139,40 @@ static const char * const ue_status_hi_desc[] = {
 BE_IF_FLAGS_MULTICAST | \
 BE_IF_FLAGS_PASS_L3L4_ERRORS)
 
+/* This procedure runs through adapters_list and sets the temperature for
+ * all functions of the same adapter. Since the temperature update is done
+ * by a single function in be_worker(), the other hwmon entries might remain
+ * with an invalid temperature.
+ */
+
+void be_set_adapters_temperature_value(struct be_adapter *adapter, u8 temp)
+{
+   struct adapter_list_node *adapter_lnode;
+   struct pci_bus *bus;
+   u8 bus_number, slot, dev;
+   u16 domain;
+
+   bus = adapter->pdev->bus;
+   domain = pci_domain_nr(bus);
+   bus_number = bus->number;
+   dev = PCI_SLOT(adapter->pdev->devfn);
+
+   mutex_lock(_list_lock);
+   list_for_each_entry(adapter_lnode, _list, node) {
+   bus = adapter_lnode->adapter->pdev->bus;
+   slot = PCI_SLOT(adapter_lnode->adapter->pdev->devfn);
+
+   if (pci_domain_nr(bus) == domain && bus->number == bus_number &&

Re: [PATCH net] i40e: use valid online CPU on q_vector initialization

2016-07-11 Thread Guilherme G. Piccoli

On 06/27/2016 12:16 PM, Guilherme G. Piccoli wrote:

Currently, the q_vector initialization routine sets the affinity_mask
of a q_vector based on v_idx value. Meaning a loop iterates on v_idx,
which is an incremental value, and the cpumask is created based on
this value.

This is a problem in systems with multiple logical CPUs per core (like in
SMT scenarios). If we disable some logical CPUs, by turning SMT off for
example, we will end up with a sparse cpu_online_mask, i.e., only the first
CPU in a core is online, and incremental filling in q_vector cpumask might
lead to multiple offline CPUs being assigned to q_vectors.

Example: if we have a system with 8 cores each one containing 8 logical
CPUs (SMT == 8 in this case), we have 64 CPUs in total. But if SMT is
disabled, only the 1st CPU in each core remains online, so the
cpu_online_mask in this case would have only 8 bits set, in a sparse way.

In general case, when SMT is off the cpu_online_mask has only C bits set:
0, 1*N, 2*N, ..., C*(N-1)  where
C == # of cores;
N == # of logical CPUs per core.
In our example, only bits 0, 8, 16, 24, 32, 40, 48, 56 would be set.

This patch changes the way q_vector's affinity_mask is created: it iterates
on v_idx, but consumes the CPU index from the cpu_online_mask instead of
just using the v_idx incremental value.

No functional changes were introduced.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
  drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++-
  1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5ea2200..a89bddd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7726,10 +7726,11 @@ static int i40e_init_msix(struct i40e_pf *pf)
   * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
   * @vsi: the VSI being configured
   * @v_idx: index of the vector in the vsi struct
+ * @cpu: cpu to be used on affinity_mask
   *
   * We allocate one q_vector.  If allocation fails we return -ENOMEM.
   **/
-static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
+static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
  {
struct i40e_q_vector *q_vector;

@@ -7740,7 +7741,8 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, 
int v_idx)

q_vector->vsi = vsi;
q_vector->v_idx = v_idx;
-   cpumask_set_cpu(v_idx, _vector->affinity_mask);
+   cpumask_set_cpu(cpu, _vector->affinity_mask);
+
if (vsi->netdev)
netif_napi_add(vsi->netdev, _vector->napi,
   i40e_napi_poll, NAPI_POLL_WEIGHT);
@@ -7764,8 +7766,7 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, 
int v_idx)
  static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
  {
struct i40e_pf *pf = vsi->back;
-   int v_idx, num_q_vectors;
-   int err;
+   int err, v_idx, num_q_vectors, current_cpu;

/* if not MSIX, give the one vector only to the LAN VSI */
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -7775,10 +7776,15 @@ static int i40e_vsi_alloc_q_vectors(struct i40e_vsi 
*vsi)
else
return -EINVAL;

+   current_cpu = cpumask_first(cpu_online_mask);
+
for (v_idx = 0; v_idx < num_q_vectors; v_idx++) {
-   err = i40e_vsi_alloc_q_vector(vsi, v_idx);
+   err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu);
if (err)
goto err_out;
+   current_cpu = cpumask_next(current_cpu, cpu_online_mask);
+   if (unlikely(current_cpu >= nr_cpu_ids))
+   current_cpu = cpumask_first(cpu_online_mask);
}

return 0;




Ping?

Sorry to bother, if you think I need to improve something here, let me 
know :)


I'm adding another Intel people in this thread, based on patches to i40e.

Thanks in advance,



Guilherme



[PATCH net] i40e: use valid online CPU on q_vector initialization

2016-06-27 Thread Guilherme G. Piccoli
Currently, the q_vector initialization routine sets the affinity_mask
of a q_vector based on v_idx value. Meaning a loop iterates on v_idx,
which is an incremental value, and the cpumask is created based on
this value.

This is a problem in systems with multiple logical CPUs per core (like in
SMT scenarios). If we disable some logical CPUs, by turning SMT off for
example, we will end up with a sparse cpu_online_mask, i.e., only the first
CPU in a core is online, and incremental filling in q_vector cpumask might
lead to multiple offline CPUs being assigned to q_vectors.

Example: if we have a system with 8 cores each one containing 8 logical
CPUs (SMT == 8 in this case), we have 64 CPUs in total. But if SMT is
disabled, only the 1st CPU in each core remains online, so the
cpu_online_mask in this case would have only 8 bits set, in a sparse way.

In general case, when SMT is off the cpu_online_mask has only C bits set:
0, 1*N, 2*N, ..., C*(N-1)  where
C == # of cores;
N == # of logical CPUs per core.
In our example, only bits 0, 8, 16, 24, 32, 40, 48, 56 would be set.

This patch changes the way q_vector's affinity_mask is created: it iterates
on v_idx, but consumes the CPU index from the cpu_online_mask instead of
just using the v_idx incremental value.

No functional changes were introduced.

Signed-off-by: Guilherme G. Piccoli <gpicc...@linux.vnet.ibm.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5ea2200..a89bddd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7726,10 +7726,11 @@ static int i40e_init_msix(struct i40e_pf *pf)
  * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
  * @vsi: the VSI being configured
  * @v_idx: index of the vector in the vsi struct
+ * @cpu: cpu to be used on affinity_mask
  *
  * We allocate one q_vector.  If allocation fails we return -ENOMEM.
  **/
-static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
+static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
 {
struct i40e_q_vector *q_vector;
 
@@ -7740,7 +7741,8 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, 
int v_idx)
 
q_vector->vsi = vsi;
q_vector->v_idx = v_idx;
-   cpumask_set_cpu(v_idx, _vector->affinity_mask);
+   cpumask_set_cpu(cpu, _vector->affinity_mask);
+
if (vsi->netdev)
netif_napi_add(vsi->netdev, _vector->napi,
   i40e_napi_poll, NAPI_POLL_WEIGHT);
@@ -7764,8 +7766,7 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, 
int v_idx)
 static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
 {
struct i40e_pf *pf = vsi->back;
-   int v_idx, num_q_vectors;
-   int err;
+   int err, v_idx, num_q_vectors, current_cpu;
 
/* if not MSIX, give the one vector only to the LAN VSI */
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -7775,10 +7776,15 @@ static int i40e_vsi_alloc_q_vectors(struct i40e_vsi 
*vsi)
else
return -EINVAL;
 
+   current_cpu = cpumask_first(cpu_online_mask);
+
for (v_idx = 0; v_idx < num_q_vectors; v_idx++) {
-   err = i40e_vsi_alloc_q_vector(vsi, v_idx);
+   err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu);
if (err)
goto err_out;
+   current_cpu = cpumask_next(current_cpu, cpu_online_mask);
+   if (unlikely(current_cpu >= nr_cpu_ids))
+   current_cpu = cpumask_first(cpu_online_mask);
}
 
return 0;
-- 
2.1.0