[PATCH v2 0/3] macb rx filter cleanups

2017-12-05 Thread Julia Cartwright
Here's a proper patchset based on net-next.

v1 -> v2:
  - Rebased on net-next
  - Add Nicolas's Acks
  - Reorder commits, putting the list_empty() cleanups prior to the
others.
  - Added commit reverting the GFP_ATOMIC change.

Julia Cartwright (3):
  net: macb: kill useless use of list_empty()
  net: macb: reduce scope of rx_fs_lock-protected regions
  net: macb: change GFP_ATOMIC to GFP_KERNEL

 drivers/net/ethernet/cadence/macb_main.c | 47 
 1 file changed, 23 insertions(+), 24 deletions(-)

-- 
2.14.2



[PATCH v2 2/3] net: macb: reduce scope of rx_fs_lock-protected regions

2017-12-05 Thread Julia Cartwright
Commit ae8223de3df5 ("net: macb: Added support for RX filtering")
introduces a lock, rx_fs_lock which is intended to protect the list of
rx_flow items and synchronize access to the hardware rx filtering
registers.

However, the region protected by this lock is overscoped, unnecessarily
including things like slab allocation.  Reduce this lock scope to only
include operations which must be performed atomically: list traversal,
addition, and removal, and hitting the macb filtering registers.

This fixes the use of kmalloc w/ GFP_KERNEL in atomic context.

Fixes: ae8223de3df5 ("net: macb: Added support for RX filtering")
Cc: Rafal Ozieblo <raf...@cadence.com>
Cc: Julia Lawall <julia.law...@lip6.fr>
Acked-by: Nicolas Ferre <nicolas.fe...@microchip.com>
Signed-off-by: Julia Cartwright <ju...@ni.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index b7644836aba1..758e8b3042b2 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2796,6 +2796,7 @@ static int gem_add_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_flow_spec *fs = &cmd->fs;
struct ethtool_rx_fs_item *item, *newfs;
+   unsigned long flags;
int ret = -EINVAL;
bool added = false;
 
@@ -2811,6 +2812,8 @@ static int gem_add_flow_filter(struct net_device *netdev,
htonl(fs->h_u.tcp_ip4_spec.ip4dst),
htons(fs->h_u.tcp_ip4_spec.psrc), 
htons(fs->h_u.tcp_ip4_spec.pdst));
 
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
+
/* find correct place to add in list */
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location > newfs->fs.location) {
@@ -2833,9 +2836,11 @@ static int gem_add_flow_filter(struct net_device *netdev,
if (netdev->features & NETIF_F_NTUPLE)
gem_enable_flow_filters(bp, 1);
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return 0;
 
 err:
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
kfree(newfs);
return ret;
 }
@@ -2846,6 +2851,9 @@ static int gem_del_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_fs_item *item;
struct ethtool_rx_flow_spec *fs;
+   unsigned long flags;
+
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
 
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location == cmd->fs.location) {
@@ -2862,12 +2870,14 @@ static int gem_del_flow_filter(struct net_device 
*netdev,
gem_writel_n(bp, SCRT2, fs->location, 0);
 
list_del(&item->list);
-   kfree(item);
bp->rx_fs_list.count--;
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
+   kfree(item);
return 0;
}
}
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return -EINVAL;
 }
 
@@ -2936,11 +2946,8 @@ static int gem_get_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd,
 static int gem_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
 {
struct macb *bp = netdev_priv(netdev);
-   unsigned long flags;
int ret;
 
-   spin_lock_irqsave(&bp->rx_fs_lock, flags);
-
switch (cmd->cmd) {
case ETHTOOL_SRXCLSRLINS:
if ((cmd->fs.location >= bp->max_tuples)
@@ -2959,7 +2966,6 @@ static int gem_set_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd)
ret = -EOPNOTSUPP;
}
 
-   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return ret;
 }
 
-- 
2.14.2



[PATCH v2 1/3] net: macb: kill useless use of list_empty()

2017-12-05 Thread Julia Cartwright
The list_for_each_entry() macro already handles the case where the list
is empty (by not executing the loop body).  It's not necessary to handle
this case specially, so stop doing so.

Cc: Rafal Ozieblo <raf...@cadence.com>
Acked-by: Nicolas Ferre <nicolas.fe...@microchip.com>
Signed-off-by: Julia Cartwright <ju...@ni.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 31 ---
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index ebfeab853bf4..b7644836aba1 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2812,24 +2812,20 @@ static int gem_add_flow_filter(struct net_device 
*netdev,
htons(fs->h_u.tcp_ip4_spec.psrc), 
htons(fs->h_u.tcp_ip4_spec.pdst));
 
/* find correct place to add in list */
-   if (list_empty(&bp->rx_fs_list.list))
-   list_add(&newfs->list, &bp->rx_fs_list.list);
-   else {
-   list_for_each_entry(item, &bp->rx_fs_list.list, list) {
-   if (item->fs.location > newfs->fs.location) {
-   list_add_tail(&newfs->list, &item->list);
-   added = true;
-   break;
-   } else if (item->fs.location == fs->location) {
-   netdev_err(netdev, "Rule not added: location %d 
not free!\n",
-   fs->location);
-   ret = -EBUSY;
-   goto err;
-   }
+   list_for_each_entry(item, &bp->rx_fs_list.list, list) {
+   if (item->fs.location > newfs->fs.location) {
+   list_add_tail(&newfs->list, &item->list);
+   added = true;
+   break;
+   } else if (item->fs.location == fs->location) {
+   netdev_err(netdev, "Rule not added: location %d not 
free!\n",
+   fs->location);
+   ret = -EBUSY;
+   goto err;
}
-   if (!added)
-   list_add_tail(&newfs->list, &bp->rx_fs_list.list);
}
+   if (!added)
+   list_add_tail(&newfs->list, &bp->rx_fs_list.list);
 
gem_prog_cmp_regs(bp, fs);
bp->rx_fs_list.count++;
@@ -2851,9 +2847,6 @@ static int gem_del_flow_filter(struct net_device *netdev,
struct ethtool_rx_fs_item *item;
struct ethtool_rx_flow_spec *fs;
 
-   if (list_empty(&bp->rx_fs_list.list))
-   return -EINVAL;
-
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location == cmd->fs.location) {
/* disable screener regs for the flow entry */
-- 
2.14.2



[PATCH v2 3/3] net: macb: change GFP_ATOMIC to GFP_KERNEL

2017-12-05 Thread Julia Cartwright
Now that the rx_fs_lock is no longer held across allocation, it's safe
to use GFP_KERNEL for allocating new entries.

This reverts commit 81da3bf6e3f88 ("net: macb: change GFP_KERNEL to
GFP_ATOMIC").

Cc: Julia Lawall <julia.law...@lip6.fr>
Signed-off-by: Julia Cartwright <ju...@ni.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index 758e8b3042b2..234667eaaa92 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2800,7 +2800,7 @@ static int gem_add_flow_filter(struct net_device *netdev,
int ret = -EINVAL;
bool added = false;
 
-   newfs = kmalloc(sizeof(*newfs), GFP_ATOMIC);
+   newfs = kmalloc(sizeof(*newfs), GFP_KERNEL);
if (newfs == NULL)
return -ENOMEM;
memcpy(&newfs->fs, fs, sizeof(newfs->fs));
-- 
2.14.2



[PATCH 1/2] net: macb: reduce scope of rx_fs_lock-protected regions

2017-12-05 Thread Julia Cartwright
Commit ae8223de3df5 ("net: macb: Added support for RX filtering")
introduces a lock, rx_fs_lock which is intended to protect the list of
rx_flow items and synchronize access to the hardware rx filtering
registers.

However, the region protected by this lock is overscoped, unnecessarily
including things like slab allocation.  Reduce this lock scope to only
include operations which must be performed atomically: list traversal,
addition, and removal, and hitting the macb filtering registers.

This fixes the use of kmalloc w/ GFP_KERNEL in atomic context.

Fixes: ae8223de3df5 ("net: macb: Added support for RX filtering")
Cc: Rafal Ozieblo <raf...@cadence.com>
Cc: Julia Lawall <julia.law...@lip6.fr>
Signed-off-by: Julia Cartwright <ju...@ni.com>
---
While Julia Lawall's cocci-generated patch fixes the problem, the right
solution is to obviate the problem altogether.

Thanks,
   The Other Julia

 drivers/net/ethernet/cadence/macb_main.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index c5fa87cdc6c4..e7ef104a077d 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2796,6 +2796,7 @@ static int gem_add_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_flow_spec *fs = &cmd->fs;
struct ethtool_rx_fs_item *item, *newfs;
+   unsigned long flags;
int ret = -EINVAL;
bool added = false;
 
@@ -2811,6 +2812,8 @@ static int gem_add_flow_filter(struct net_device *netdev,
htonl(fs->h_u.tcp_ip4_spec.ip4dst),
htons(fs->h_u.tcp_ip4_spec.psrc), 
htons(fs->h_u.tcp_ip4_spec.pdst));
 
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
+
/* find correct place to add in list */
if (list_empty(&bp->rx_fs_list.list))
list_add(&newfs->list, &bp->rx_fs_list.list);
@@ -2837,9 +2840,11 @@ static int gem_add_flow_filter(struct net_device *netdev,
if (netdev->features & NETIF_F_NTUPLE)
gem_enable_flow_filters(bp, 1);
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return 0;
 
 err:
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
kfree(newfs);
return ret;
 }
@@ -2850,9 +2855,14 @@ static int gem_del_flow_filter(struct net_device *netdev,
struct macb *bp = netdev_priv(netdev);
struct ethtool_rx_fs_item *item;
struct ethtool_rx_flow_spec *fs;
+   unsigned long flags;
 
-   if (list_empty(&bp->rx_fs_list.list))
+   spin_lock_irqsave(&bp->rx_fs_lock, flags);
+
+   if (list_empty(&bp->rx_fs_list.list)) {
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return -EINVAL;
+   }
 
list_for_each_entry(item, &bp->rx_fs_list.list, list) {
if (item->fs.location == cmd->fs.location) {
@@ -2869,12 +2879,14 @@ static int gem_del_flow_filter(struct net_device 
*netdev,
gem_writel_n(bp, SCRT2, fs->location, 0);
 
list_del(&item->list);
-   kfree(item);
bp->rx_fs_list.count--;
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
+   kfree(item);
return 0;
}
}
 
+   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return -EINVAL;
 }
 
@@ -2943,11 +2955,8 @@ static int gem_get_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd,
 static int gem_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
 {
struct macb *bp = netdev_priv(netdev);
-   unsigned long flags;
int ret;
 
-   spin_lock_irqsave(&bp->rx_fs_lock, flags);
-
switch (cmd->cmd) {
case ETHTOOL_SRXCLSRLINS:
if ((cmd->fs.location >= bp->max_tuples)
@@ -2966,7 +2975,6 @@ static int gem_set_rxnfc(struct net_device *netdev, 
struct ethtool_rxnfc *cmd)
ret = -EOPNOTSUPP;
}
 
-   spin_unlock_irqrestore(&bp->rx_fs_lock, flags);
return ret;
 }
 
-- 
2.14.2


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-17 Thread Julia Cartwright
On Sat, Oct 15, 2016 at 12:06:33AM +0200, Richard Cochran wrote:
> On Fri, Oct 14, 2016 at 08:58:22AM +, Koehrer Mathias (ETAS/ESW5) wrote:
> > @@ -753,7 +756,9 @@ u32 igb_rd32(struct e1000_hw *hw, u32 re
> > if (E1000_REMOVED(hw_addr))
> > return ~value;
> >  
> > +trace_igb(801);
> > value = readl(&hw_addr[reg]);
> > +trace_igb(802);
> 
> Nothing prevents this code from being preempted between the two trace
> points, and so you can't be sure whether the time delta in the trace
> is caused by the PCIe read stalling or not.

While that is certainly the case, and would explain the most egregious
of measured latency spikes, it doesn't invalidate the test if you
consider the valuable data point(s) to be the minimum and/or median
latencies.

   Julia


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-17 Thread Julia Cartwright
+linux-pci

On Mon, Oct 17, 2016 at 08:39:40AM -0700, Alexander Duyck wrote:
> On Mon, Oct 17, 2016 at 8:00 AM, Koehrer Mathias (ETAS/ESW5)
>  wrote:
> > Hi Julia!
> >> > > Have you tested on a vanilla (non-RT) kernel?  I doubt there is
> >> > > anything RT specific about what you are seeing, but it might be nice
> >> > > to get confirmation.  Also, bisection would probably be easier if you 
> >> > > confirm on a
> >> vanilla kernel.
> >> > >
> >> > > I find it unlikely that it's a kernel config option that changed
> >> > > which regressed you, but instead was a code change to a driver.
> >> > > Which driver is now the question, and the surface area is still big
> >> > > (processor mapping attributes for this region, PCI root complex 
> >> > > configuration,
> >> PCI brige configuration, igb driver itself, etc.).
> >> > >
> >> > > Big enough that I'd recommend a bisection.  It looks like a
> >> > > bisection between 3.18 and 4.8 would take you about 18 tries to narrow 
> >> > > down,
> >> assuming all goes well.
> >> > >
> >> >
> >> > I have now repeated my tests using the vanilla kernel.
> >> > There I got the very same issue.
> >> > Using kernel 4.0 is fine, however starting with kernel 4.1, the issue 
> >> > appears.
> >>
> >> Great, thanks for confirming!  That helps narrow things down quite a bit.
> >>
> >> > Here is my exact (reproducible) test description:
> >> > I applied the following patch to the kernel to get the igb trace.
> >> > This patch instruments the igb_rd32() function to measure the call to
> >> > readl() which is used to access registers of the igb NIC.
> >>
> >> I took your test setup and ran it between 4.0 and 4.1 on the hardware on 
> >> my desk,
> >> which is an Atom-based board with dual I210s, however I didn't see much
> >> difference.
> >>
> >> However, it's a fairly simple board, with a much simpler PCI topology than 
> >> your
> >> workstation.  I'll see if I can find some other hardware to test on.
> >>
> >> [..]
> >> > This means, that I think that some other stuff in kernel 4.1 has
> >> > changed, which has impact on the igb accesses.
> >> >
> >> > Any idea what component could cause this kind of issue?
> >>
> >> Can you continue your bisection using 'git bisect'?  You've already 
> >> narrowed it down
> >> between 4.0 and 4.1, so you're well on your way.
> >>
> >
> > OK - done.
> > And finally I was successful!
> > The following git commit is the one that is causing the trouble!
> > (The full commit is in the attachment).
> > + BEGIN +++
> > commit 387d37577fdd05e9472c20885464c2a53b3c945f
> > Author: Matthew Garrett 
> > Date:   Tue Apr 7 11:07:00 2015 -0700
> >
> > PCI: Don't clear ASPM bits when the FADT declares it's unsupported
> >
> > Communications with a hardware vendor confirm that the expected 
> > behaviour
> > on systems that set the FADT ASPM disable bit but which still grant full
> > PCIe control is for the OS to leave any BIOS configuration intact and
> > refuse to touch the ASPM bits.  This mimics the behaviour of Windows.
> >
> > Signed-off-by: Matthew Garrett 
> > Signed-off-by: Bjorn Helgaas 
> > + HEADER +++
> >
> > The only files that are modified by this commit are
> > drivers/acpi/pci_root.c
> > drivers/pci/pcie/aspm.c
> > include/linux/pci-aspm.h
> >
> > This is all generic PCIe stuff - however I do not really understand what
> > the changes of the commit are...
> >
> > In my setup I am using a dual port igb Ethernet adapter.
> > This has an onboard PCIe switch and it might be that the configuration of 
> > this
> > PCIe switch on the Intel board is causing the trouble.
> >
> > Please see also the output of "lspci -v" in the attachment.
> > The relevant PCI address of the NIC is 04:00.0 / 04:00.1
> >
> > Any feedback on this is welcome!
> >
> > Thanks
> >
> > Mathias
>
> Hi Mathias,
>
> If you could set the output of lspci -vvv it might be more useful as
> most of the configuration data isn't present in the lspci dump you had
> attached.  Specifically if you could do this for the working case and
> the non-working case we could verify if this issue is actually due to
> the ASPM configuration on the device.
>
> Also one thing you might try is booting your kernel with the kernel
> parameter "pcie_aspm=off".  It sounds like the extra latency is likely
> due to your platform enabling ASPM on the device and this in turn will
> add latency if the PCIe link is disabled when you attempt to perform a
> read as it takes some time to bring the PCIe link up when in L1 state.

So if we assume it's this commit causing the regression, then it's safe
to assume that this system's BIOS is claiming to not support ASPM in the
FADT, but the BIOS is leaving ASPM configured in some way on the
relevant devices.

Also, unfortunately, taking a look at the code which handles

Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-14 Thread Julia Cartwright
On Fri, Oct 14, 2016 at 08:58:22AM +, Koehrer Mathias (ETAS/ESW5) wrote:
> Hi Julia,
>
> > Have you tested on a vanilla (non-RT) kernel?  I doubt there is anything RT 
> > specific
> > about what you are seeing, but it might be nice to get confirmation.  Also, 
> > bisection
> > would probably be easier if you confirm on a vanilla kernel.
> >
> > I find it unlikely that it's a kernel config option that changed which 
> > regressed you, but
> > instead was a code change to a driver.  Which driver is now the question, 
> > and the
> > surface area is still big (processor mapping attributes for this region, 
> > PCI root
> > complex configuration, PCI brige configuration, igb driver itself, etc.).
> >
> > Big enough that I'd recommend a bisection.  It looks like a bisection 
> > between 3.18
> > and 4.8 would take you about 18 tries to narrow down, assuming all goes 
> > well.
> >
>
> I have now repeated my tests using the vanilla kernel.
> There I got the very same issue.
> Using kernel 4.0 is fine, however starting with kernel 4.1, the issue appears.

Great, thanks for confirming!  That helps narrow things down quite a
bit.

> Here is my exact (reproducible) test description:
> I applied the following patch to the kernel to get the igb trace.
> This patch instruments the igb_rd32() function to measure the call
> to readl() which is used to access registers of the igb NIC.

I took your test setup and ran it between 4.0 and 4.1 on the hardware on
my desk, which is an Atom-based board with dual I210s, however I didn't
see much difference.

However, it's a fairly simple board, with a much simpler PCI topology
than your workstation.  I'll see if I can find some other hardware to
test on.

[..]
> This means, that I think that some other stuff in kernel 4.1 has changed,
> which has impact on the igb accesses.
>
> Any idea what component could cause this kind of issue?

Can you continue your bisection using 'git bisect'?  You've already
narrowed it down between 4.0 and 4.1, so you're well on your way.

Another option might be to try to eliminate igb from the picture as
well, and try reading from another device from the same (or, perhaps
nearest) bus segment, and see if you see the same results.

   Julia


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-13 Thread Julia Cartwright
Hey Mathias-

On Thu, Oct 13, 2016 at 10:57:18AM +, Koehrer Mathias (ETAS/ESW5) wrote:
[..]

Interesting indeed!

> > Here are some places where I added traces:
> > In file igb_ptp.c:
> > void igb_ptp_rx_hang(struct igb_adapter *adapter) {
> > struct e1000_hw *hw = &adapter->hw;
> > unsigned long rx_event;
> > u32 tsyncrxctl;
> > trace_igb(700);
> > tsyncrxctl = rd32(E1000_TSYNCRXCTL);
> > trace_igb(701);
> >
> > /* Other hardware uses per-packet timestamps */
> > if (hw->mac.type != e1000_82576)
> > return;
> > ...
> >
> > In file igb_main.c:
> > static void igb_check_lvmmc(struct igb_adapter *adapter) {
> > struct e1000_hw *hw = &adapter->hw;
> > u32 lvmmc;
> >
> > trace_igb(600);
> > lvmmc = rd32(E1000_LVMMC);
> > trace_igb(601);
> > if (lvmmc) {
> > ...
> >
[..]
> > The time between my trace points 700 and 701 is about 30us, the time 
> > between my
> > trace points 600 and 601 is even 37us!!
> >
> > The code in between is
> >  tsyncrxctl = rd32(E1000_TSYNCRXCTL); resp.
> >  lvmmc = rd32(E1000_LVMMC);
> >
> > In both cases this is a single read from a register.
> >
> > I have no idea why this single read could take that much time!

Are these the only registers you see this amount of delay when reading?

It's also possible that it's not these registers themselves that cause
problems, but any writes prior to these reads.  That is, given to PCI's
posted write behavior, it could be that these reads are delayed only
because it's flushing previously writes to the device.

> > Is it possible that the igb hardware is in a state that delays the read 
> > access and this is
> > why the whole I/O system might be delayed?

One additional hypothesis is that some register accesses trigger
accesses to off-chip resources synchronously; for example, a write to
enable timestamping needs to access an external phy on a slower bus,
etc.  I don't know enough about this device to say whether or not that
happens or not.

> To have a proper comparison, I did the same with kernel 3.18.27-rt27.
> Also here, I instrumented the igb driver to get traces for the rd32 calls.
> However, here everything is generally much faster!

> In the idle system the maximum I got for a read was about 6us, most times it 
> was 1-2us.
> On the 4.8 kernel this is always much slower (see above).
> My question is now: Is there any kernel config option that has been 
> introduced in the meantime
> that may lead to this effect and which is not set in my 4.8 config?

Have you tested on a vanilla (non-RT) kernel?  I doubt there is anything
RT specific about what you are seeing, but it might be nice to get
confirmation.  Also, bisection would probably be easier if you confirm
on a vanilla kernel.

I find it unlikely that it's a kernel config option that changed which
regressed you, but instead was a code change to a driver.  Which driver
is now the question, and the surface area is still big (processor
mapping attributes for this region, PCI root complex configuration, PCI
brige configuration, igb driver itself, etc.).

Big enough that I'd recommend a bisection.  It looks like a bisection
between 3.18 and 4.8 would take you about 18 tries to narrow down,
assuming all goes well.

   Julia


Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-10 Thread Julia Cartwright
Hello Mathias-

On Fri, Oct 07, 2016 at 08:58:08AM +, Koehrer Mathias (ETAS/ESW5) wrote:
[..]
> I modified the in-kernel's igb_main.c (function igb_watchdog_task) to comment 
> out
> the access to the EICS registers:
> 
> --- igb_main.c.orig 2016-10-07 10:43:37.855873754 +0200
> +++ igb_main.c  2016-10-07 10:31:51.451346292 +0200
> @@ -4613,6 +4613,7 @@ no_wait:
> }
> 
> /* Cause software interrupt to ensure Rx ring is cleaned */
> +#if 0
> if (adapter->flags & IGB_FLAG_HAS_MSIX) {
> u32 eics = 0;
> 
> @@ -4622,6 +4623,7 @@ no_wait:
> } else {
> wr32(E1000_ICS, E1000_ICS_RXDMT0);
> }
> +#endif
> 
> igb_spoof_check(adapter);
> igb_ptp_rx_hang(adapter);
> 
> 
> The result is now slighty better, however the max latency that has been 
> measured by
> cyclictest is still much higher compared to the values of kernel 3.18.27-rt27.
> I have now enabled all events, the execution of 
> # cyclictest -a -i 105 -m -n -p 80 -t 1  -b 30 -C
> delivers the following trace values

There is something still fishy with these traces...

> cyclicte-10062   0... 10025813402us : sys_exit: NR 230 = 0
> cyclicte-10062   0... 10025813402us : sys_enter: NR 230 (1, 1, 
> 773ff930, 0, 0, 2bd7e12e)
> cyclicte-10062   0... 10025813402us : hrtimer_init: 
> hrtimer=88040a017e18 clockid=CLOCK_MONOTONIC mode=HRTIMER_MODE_ABS
> cyclicte-10062   0d...1.. 10025813403us : hrtimer_start: 
> hrtimer=88040a017e18 function=hrtimer_wakeup expires=10024735653388 
> softexpires=10024735653388

cyclictest thread sleeps waiting for wakeup at 10024735653388.

> cyclicte-10062   0d...1.. 10025813403us : write_msr: 6e0, value 20ca630b9aef
> cyclicte-10062   0d...1.. 10025813403us : rcu_utilization: Start context 
> switch
> cyclicte-10062   0d...1.. 10025813403us : rcu_utilization: End context switch
> cyclicte-10062   0d...2.. 10025813404us : sched_switch: prev_comm=cyclictest 
> prev_pid=10062 prev_prio=19 prev_state=S ==> next_comm=kworker/0:3 
> next_pid=1388 next_prio=120
> cyclicte-10062   0d...2.. 10025813404us+: x86_fpu_regs_deactivated: x86/fpu: 
> 88040c603ec0 fpregs_active: 0 fpstate_active: 1 counter: 69 xfeatures: 2 
> xcomp_bv: 0
> kworker/-13880d..h... 10025813468us : irq_handler_entry: irq=48 
> name=eth2-tx-0
> kworker/-13880d..h... 10025813468us : irq_handler_exit: irq=48 ret=handled
> kworker/-13880d..h1.. 10025813469us : sched_waking: comm=irq/48-eth2-tx- 
> pid=10057 prio=49 target_cpu=000
> kworker/-13880dN.h2.. 10025813469us : sched_wakeup: comm=irq/48-eth2-tx- 
> pid=10057 prio=49 target_cpu=000
> kworker/-13880dN.h1.. 10025813470us : irq_handler_entry: irq=47 
> name=eth2-rx-0
> kworker/-13880dN.h1.. 10025813470us : irq_handler_exit: irq=47 ret=handled
> kworker/-13880dN.h2.. 10025813471us : sched_waking: comm=irq/47-eth2-rx- 
> pid=10056 prio=49 target_cpu=000
> kworker/-13880dN.h3.. 10025813471us : sched_wakeup: comm=irq/47-eth2-rx- 
> pid=10056 prio=49 target_cpu=000

So, kworker/0:3 was busy doing something, and we received both tx and rx
interrupts from eth2, waking up the relevant threads.

> kworker/-13880dN..1.. 10025813472us : rcu_utilization: Start context 
> switch
> kworker/-13880dN..1.. 10025813472us : rcu_utilization: End context switch
> kworker/-13880dN..2.. 10025813472us : sched_stat_runtime: 
> comm=kworker/0:3 pid=1388 runtime=67566 [ns] vruntime=101216288332 [ns]
> kworker/-13880d...2.. 10025813472us : sched_switch: prev_comm=kworker/0:3 
> prev_pid=1388 prev_prio=120 prev_state=R+ ==> next_comm=irq/48-eth2-tx- 
> next_pid=10057 next_prio=49
> irq/48-e-10057   0d11 10025813473us : softirq_raise: vec=3 [action=NET_RX]
> irq/48-e-10057   0.12 10025813474us : softirq_entry: vec=3 [action=NET_RX]
> irq/48-e-10057   0.12 10025813475us : napi_poll: napi poll on napi struct 
> 88040a582850 for device eth2 work 0 budget 64

Hmm, the irq/48-eth2-tx- thread is raising NET_RX?  That seems...wrong.

> irq/48-e-10057   0.12 10025813475us : softirq_exit: vec=3 [action=NET_RX]
> irq/48-e-10057   0d...1.. 10025813475us : rcu_utilization: Start context 
> switch
> irq/48-e-10057   0d...1.. 10025813476us : rcu_utilization: End context switch
> irq/48-e-10057   0d...2.. 10025813476us : sched_switch: 
> prev_comm=irq/48-eth2-tx- prev_pid=10057 prev_prio=49 prev_state=S ==> 
> next_comm=irq/47-eth2-rx- next_pid=10056 next_prio=49
> irq/47-e-10056   0d11 10025813477us : softirq_raise: vec=3 [action=NET_RX]
> irq/47-e-10056   0.12 10025813477us : softirq_entry: vec=3 [action=NET_RX]
> irq/47-e-10056   0.12 10025813478us : napi_poll: napi poll on napi struct 
> 88040a580850 for device eth2 work 0 budget 64
> irq/47-e-10056   0.12 10025813478us : softirq_exit: vec=3 [action=NET_RX]

Oh, then we do the same thing again?  Hrmph...why?

> irq/47-e-10056   0d...1.. 10025813479us : rcu_utilization: Start context 
> switch
> 

Re: Kernel 4.6.7-rt13: Intel Ethernet driver igb causes huge latencies in cyclictest

2016-10-05 Thread Julia Cartwright
On Wed, Oct 05, 2016 at 07:02:21AM +, Koehrer Mathias (ETAS/ESW5) wrote:
> Hi Julia,
> 
> > > In the meanwhile I have detected another finding which might be relevant:
> > >
> > > With the 3.18 kernel the igb driver comes with two interrupts per
> > > NIC (e.g. eth2 and eth2-TxRx0) with the 4.6. kernel the igb driver
> > > comes with 9 (!) interrupts per NIC: eth2, and eth2-TxRx-0,
> > > eth2-TxRx-1, ... , eth2-TxRx-7.
> > >
> > > As I have used initially the same kernel configuration from 3.18 also
> > > for the 4.6. kernel I wonder where this comes from and if there is any
> > > kernel option I may use to disable these many interrupts and to reduce
> > > it to 2 again.
> > 
> > If it's all of these interrupts that are firing and being handled at the 
> > same time, that
> > can account for the latencies you were seeing.  As I suggested before, 
> > having a
> > trace with the sched_wakeup event enabled can help confirm that it's these
> > interrupts causing problems.
> > 
> > If it is true, then the question is: why is the device triggering all of 
> > these interrupts all
> > at once?  Is that expected?  These are questions for netdev folks, I think.
> > 
> >Julia
> 
> OK - I ran again the cyclictest. This time I used the -C option:
> # cyclictest -a -i 100 -m -n -p 80 -t 1 -b 21 -C
> 
> And the last output lines of the trace are:
> cyclicte-58870d...2.. 1504647266us!: sched_switch: prev_comm=cyclictest 
> prev_pid=5887 prev_prio=19 prev_state=S ==> next_comm=kworker/0:0 next_pid=5 
> next_prio=120
> kworker/-5   0dN.h2.. 1504647372us : sched_wakeup: comm=cyclictest 
> pid=5887 prio=19 target_cpu=000
> kworker/-5   0dN.h3.. 1504647374us : sched_wakeup: comm=irq/54-eth2-TxR 
> pid=5883 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647375us : sched_wakeup: comm=irq/53-eth2-TxR 
> pid=5882 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647377us : sched_wakeup: comm=irq/52-eth2-TxR 
> pid=5881 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647378us : sched_wakeup: comm=irq/51-eth2-TxR 
> pid=5880 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647380us : sched_wakeup: comm=irq/50-eth2-TxR 
> pid=5879 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647381us : sched_wakeup: comm=irq/49-eth2-TxR 
> pid=5878 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647382us : sched_wakeup: comm=irq/48-eth2-TxR 
> pid=5877 prio=49 target_cpu=000
> kworker/-5   0dN.h3.. 1504647383us : sched_wakeup: comm=irq/47-eth2-TxR 
> pid=5876 prio=49 target_cpu=000
> kworker/-5   0d...2.. 1504647384us : sched_switch: prev_comm=kworker/0:0 
> prev_pid=5 prev_prio=120 prev_state=R+ ==> next_comm=cyclictest next_pid=5887 
> next_prio=19
> cyclicte-58870.11 1504647389us : tracing_mark_write: hit latency 
> threshold (28 > 21)
> 
> The attached trace-extract.gz shows some more lines.
> It actually looks to me as if the the many irq threads from igb are causing 
> the issue.

Yes, I think so.

Although, to be clear, it isn't the fact that there exists 8 threads,
it's that the device is firing all 8 interrupts at the same time.  The
time spent in hardirq context just waking up all 8 of those threads (and
the cyclictest wakeup) is enough to cause your regression.

netdev/igb folks-

Under what conditions should it be expected that the i350 trigger all of
the TxRx interrupts simultaneously?  Any ideas here?

See the start of this thread here:

  
http://lkml.kernel.org/r/d648628329bc446fa63b5e19d4d3f...@fe-mbx1012.de.bosch.com

   Julia