Re: [bug] forcedeth: hung interface under load

2007-04-06 Thread Ingo Molnar

* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> > there's a different type of regression now: under high load i dont 
> > get a crash, i get a hung interface instead. No error packets or 
> > other weird interface state - just a hung interface. [...]
> 
> the interface stats do not change from that point on:
> 
> eth1  Link encap:Ethernet  HWaddr 00:13:D4:DC:41:12
>   inet addr:10.0.1.12  Bcast:10.0.1.255  Mask:255.255.255.0
>   UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>   RX packets:14976 errors:0 dropped:0 overruns:0 frame:0
>   TX packets:3928743 errors:0 dropped:0 overruns:0 carrier:0
>   collisions:0 txqueuelen:1000
>   RX bytes:1028544 (1004.4 KiB)  TX bytes:4126766510 (3.8 GiB)
>   Interrupt:16 Base address:0xa000
> 
> and the irq count does not change either:
> 
>  16:8163463148   IO-APIC-fasteoi   eth1
> 
> no matter what i do to the interface. So it's completely stuck. No 
> kernel messages either - apparently nv_tx_timeout() never triggered.

i've attached an ethtool dump, ifconfig output, interrupts output and 
lspci output of such a hang. Does the ethtool dump make any sense to 
you? The driver is -rc6 plus the changes below. (but the hang looks 
exactly the same as the one i got with an unmodified driver. the 
optimization_mode tweak is a new change too - it drastically improves 
the performance and scalability of the driver btw., by not letting it 
do 100-200K irqs/sec (!).)

Ingo

>
Index: linux/drivers/net/forcedeth.c
===
--- linux.orig/drivers/net/forcedeth.c
+++ linux/drivers/net/forcedeth.c
@@ -800,7 +800,7 @@ struct fe_priv {
  * Maximum number of loops until we assume that a bit in the irq mask
  * is stuck. Overridable with module param.
  */
-static int max_interrupt_work = 5;
+static int max_interrupt_work = 50;
 
 /*
  * Optimization can be either throuput mode or cpu mode
@@ -812,7 +812,7 @@ enum {
NV_OPTIMIZATION_MODE_THROUGHPUT,
NV_OPTIMIZATION_MODE_CPU
 };
-static int optimization_mode = NV_OPTIMIZATION_MODE_THROUGHPUT;
+static int optimization_mode = NV_OPTIMIZATION_MODE_CPU;
 
 /*
  * Poll interval for timer irq
@@ -1902,6 +1902,11 @@ static void nv_tx_done(struct net_device
np->stats.tx_carrier_errors++;
np->stats.tx_errors++;
} else {
+   if (!np->get_tx_ctx->skb) {
+   printk("get_tx: %ld, put_tx: %ld\n", (long)(np->get_tx_ctx - np->first_tx_ctx), (long)(np->put_tx_ctx - np->first_tx_ctx));
+   WARN_ON(1);
+   break;
+   }
np->stats.tx_packets++;
np->stats.tx_bytes += 
np->get_tx_ctx->skb->len;
}
@@ -1917,6 +1922,11 @@ static void nv_tx_done(struct net_device
np->stats.tx_carrier_errors++;
np->stats.tx_errors++;
} else {
+   if (!np->get_tx_ctx->skb) {
+   printk("get_tx: %ld, put_tx: %ld\n", (long)(np->get_tx_ctx - np->first_tx_ctx), (long)(np->put_tx_ctx - np->first_tx_ctx));
+   WARN_ON(1);
+   break;
+   }
np->stats.tx_packets++;
np->stats.tx_bytes += 
np->get_tx_ctx->skb->len;
}
@@ -3108,9 +3118,17 @@ static int nv_napi_poll(struct net_devic
int retcode;
 
if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+   spin_lock_irqsave(&np->lock, flags);
+   nv_tx_done(dev);
+   spin_unlock_irqrestore(&np->lock, flags);
+
pkts = nv_rx_process(dev, limit);
retcode = nv_alloc_rx(dev);
} else {
+   spin_lock_irqsave(&np->lock, flags);
+   nv_tx_done_optimized(dev, np->tx_ring_size);
+   spin_unlock_irqrestore(&np->lock, flags);
+
pkts = nv_rx_process_optimized(dev, limit);
retcode = nv_alloc_rx_optimized(dev);
}
Offset  Value
-
00  0x72
01  0x00
02  0x00
03  0x00
04  0xe7
05  0x00
06  0x00
07  0x00
08  0x03
09  0x00
10  0x00
11  0x00
12  0x0d
13  0x00
14  0x08
15  0x00
16  0x00
17  0x00
18  0x00
19  0x00
20  0x00
21  0x00
22  0x00
23  0x00
24  0x00
25  0x00
26  

Re: [bug] forcedeth: hung interface under load

2007-04-06 Thread Ingo Molnar

* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> > there's a different type of regression now: under high load i dont 
> > get a crash, i get a hung interface instead. No error packets or 
> > other weird interface state - just a hung interface. [...]
> 
> the interface stats do not change from that point on:
> 
> eth1  Link encap:Ethernet  HWaddr 00:13:D4:DC:41:12
>   inet addr:10.0.1.12  Bcast:10.0.1.255  Mask:255.255.255.0
>   UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>   RX packets:14976 errors:0 dropped:0 overruns:0 frame:0
>   TX packets:3928743 errors:0 dropped:0 overruns:0 carrier:0
>   collisions:0 txqueuelen:1000
>   RX bytes:1028544 (1004.4 KiB)  TX bytes:4126766510 (3.8 GiB)
>   Interrupt:16 Base address:0xa000
> 
> and the irq count does not change either:
> 
>  16:8163463148   IO-APIC-fasteoi   eth1
> 
> no matter what i do to the interface. So it's completely stuck. No 
> kernel messages either - apparently nv_tx_timeout() never triggered.

i've attached an ethtool dump, ifconfig output, interrupts output and 
lspci output of such a hang. Does the ethtool dump make any sense to 
you? The driver is -rc6 plus the changes below. (but the hang looks 
exactly the same as the one i got with an unmodified driver. the 
optimization_mode tweak is a new change too - it drastically improves 
the performance and scalability of the driver btw., by not letting it 
do 100-200K irqs/sec (!).)

Ingo


Index: linux/drivers/net/forcedeth.c
===
--- linux.orig/drivers/net/forcedeth.c
+++ linux/drivers/net/forcedeth.c
@@ -800,7 +800,7 @@ struct fe_priv {
  * Maximum number of loops until we assume that a bit in the irq mask
  * is stuck. Overridable with module param.
  */
-static int max_interrupt_work = 5;
+static int max_interrupt_work = 50;
 
 /*
  * Optimization can be either throuput mode or cpu mode
@@ -812,7 +812,7 @@ enum {
NV_OPTIMIZATION_MODE_THROUGHPUT,
NV_OPTIMIZATION_MODE_CPU
 };
-static int optimization_mode = NV_OPTIMIZATION_MODE_THROUGHPUT;
+static int optimization_mode = NV_OPTIMIZATION_MODE_CPU;
 
 /*
  * Poll interval for timer irq
@@ -1902,6 +1902,11 @@ static void nv_tx_done(struct net_device
np->stats.tx_carrier_errors++;
np->stats.tx_errors++;
} else {
+   if (!np->get_tx_ctx->skb) {
+   printk("get_tx: %ld, put_tx: %ld\n", (long)(np->get_tx_ctx - np->first_tx_ctx), (long)(np->put_tx_ctx - np->first_tx_ctx));
+   WARN_ON(1);
+   break;
+   }
np->stats.tx_packets++;
np->stats.tx_bytes += 
np->get_tx_ctx->skb->len;
}
@@ -1917,6 +1922,11 @@ static void nv_tx_done(struct net_device
np->stats.tx_carrier_errors++;
np->stats.tx_errors++;
} else {
+   if (!np->get_tx_ctx->skb) {
+   printk("get_tx: %ld, put_tx: %ld\n", (long)(np->get_tx_ctx - np->first_tx_ctx), (long)(np->put_tx_ctx - np->first_tx_ctx));
+   WARN_ON(1);
+   break;
+   }
np->stats.tx_packets++;
np->stats.tx_bytes += 
np->get_tx_ctx->skb->len;
}
@@ -3108,9 +3118,17 @@ static int nv_napi_poll(struct net_devic
int retcode;
 
if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+   spin_lock_irqsave(&np->lock, flags);
+   nv_tx_done(dev);
+   spin_unlock_irqrestore(&np->lock, flags);
+
pkts = nv_rx_process(dev, limit);
retcode = nv_alloc_rx(dev);
} else {
+   spin_lock_irqsave(&np->lock, flags);
+   nv_tx_done_optimized(dev, np->tx_ring_size);
+   spin_unlock_irqrestore(&np->lock, flags);
+
pkts = nv_rx_process_optimized(dev, limit);
retcode = nv_alloc_rx_optimized(dev);
}
Offset  Value
-
00  0x72
01  0x00
02  0x00
03  0x00
04  0xe7
05  0x00
06  0x00
07  0x00
08  0x03
09  0x00
10  0x00
11  0x00
12  0x0d
13  0x00
14  0x08
15  0x00
16  0x00
17  0x00
18  0x00
19  0x00
20  0x00
21  0x00
22  0x00
23  0x00
24  0x00
25  0x00
26  0x00
27  0x00
28  0x00
29  0x00
30