I added a separate flag to keep track of whether a neighbour struct
is on the timer list or not. I also added some logic to dump the
stack if the add-timer method was called while we are already on
the timer list.
This dump-stack method is being called quite often in my test that
used to reproduce the netdev hang, but the system has not hung yet.
Could a double-add to the timer list be the problem?
Here is the first error:
ERROR: trying to add timer, but already there, neigh: d5884780
[<c02cce45>] neigh_add_timer+0x75/0x80
[<c02ce077>] neigh_timer_handler+0x97/0x2b0
[<c012a3f2>] run_timer_softirq+0xe2/0x1b0
[<c02cdfe0>] neigh_timer_handler+0x0/0x2b0
[<c012a3f2>] run_timer_softirq+0xe2/0x1b0
[<c011c1ef>] rebalance_tick+0xff/0x110
[<c0126309>] __do_softirq+0xd9/0xf0
[<c012636a>] do_softirq+0x4a/0x50
[<c0126464>] irq_exit+0x44/0x50
[<c0103a40>] apic_timer_interrupt+0x1c/0x24
[<c0100f93>] mwait_idle+0x33/0x50
[<c02104a7>] acpi_processor_idle+0x0/0x2ae
[<c02105ad>] acpi_processor_idle+0x106/0x2ae
[<c02104a7>] acpi_processor_idle+0x0/0x2ae
[<c0100db1>] cpu_idle+0x61/0x80
[<c046491c>] start_kernel+0x18c/0x1d0
[<c0464360>] unknown_bootoption+0x0/0x1c0
Here is the patch, on top of the rest of my debugging code.
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -124,7 +124,7 @@ struct neighbour
struct neighbour *next;
struct neigh_table *tbl;
struct neigh_parms *parms;
- struct net_device *dev;
+ struct net_device *dev;
unsigned long used;
unsigned long confirmed;
unsigned long updated;
@@ -136,10 +136,11 @@ struct neighbour
rwlock_t lock;
unsigned char ha[(MAX_ADDR_LEN+sizeof(unsigned
long)-1)&~(sizeof(unsigned long)-1)];
struct hh_cache *hh;
- atomic_t __refcnt; /* modify through neigh_put,
neigh_hold */
+ atomic_t __refcnt; /* modify through neigh_release,
neigh_hold */
int (*output)(struct sk_buff *skb);
struct sk_buff_head arp_queue;
struct timer_list timer;
+ int in_timer; /* boolean */
struct neigh_ops *ops;
u8 primary_key[0];
};
diff --git a/net/atm/clip.c b/net/atm/clip.c
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -146,11 +146,11 @@ static int neigh_check_cb(struct neighbo
if (entry->vccs || time_before(jiffies, entry->expires))
return 0;
- if (atomic_read(&n->refcnt) > 1) {
+ if (atomic_read(&n->__refcnt) > 1) {
struct sk_buff *skb;
DPRINTK("destruction postponed with ref %d\n",
- atomic_read(&n->refcnt));
+ atomic_read(&n->__refcnt));
while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
dev_kfree_skb(skb);
@@ -539,7 +539,7 @@ static int clip_setentry(struct atm_vcc
}
error = ip_route_output_key(&rt,&fl);
if (error) return error;
- neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1);
+ neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1, NDRK_GENERIC);
ip_rt_put(rt);
if (!neigh)
return -ENOMEM;
@@ -554,7 +554,7 @@ static int clip_setentry(struct atm_vcc
}
error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
NEIGH_UPDATE_F_OVERRIDE|NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
+ neigh_release(neigh, NDRK_GENERIC);
return error;
}
@@ -855,7 +855,7 @@ static void atmarp_info(struct seq_file
seq_printf(seq, "(resolving)\n");
else
seq_printf(seq, "(expired, ref %d)\n",
- atomic_read(&entry->neigh->refcnt));
+ atomic_read(&entry->neigh->__refcnt));
} else if (!svc) {
seq_printf(seq, "%d.%d.%d\n",
clip_vcc->vcc->dev->number,
diff --git a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -593,8 +593,8 @@ static void dump_rfcnt_info(struct net_d
n->flags, n->nud_state,
n->type,
n->dead,
atomic_read(&n->probes),
n->hh,
atomic_read(&n->__refcnt));
- printk(" output: %p ops:
%p\n",
- n->output, n->ops);
+ printk(" output: %p ops: %p in_timer:
%d\n",+ n->output, n->ops,
n->in_timer);
dump_neigh_rfcnt_info(n);
debug_locate_neigh(n);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -157,11 +157,31 @@ static int neigh_forced_gc(struct neigh_
return shrunk;
}
-static int neigh_del_timer(struct neighbour *n)
-{
- if ((n->nud_state & NUD_IN_TIMER) &&
- del_timer(&n->timer)) {
- neigh_release(n, NDRK_NEIGH_TIMER);
+static int neigh_del_timer(struct neighbour *n) {
+ if (n->in_timer) {
+ if (!n->nud_state & NUD_IN_TIMER) {
+ printk("ERROR: in_timer true, but NUD_IN_TIMER false,
neigh: %p\n", n);
+ dump_stack();
+ }
+ if (del_timer(&n->timer)) {
+ n->in_timer = 0;
+ neigh_release(n, NDRK_NEIGH_TIMER);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int neigh_add_timer(struct neighbour *n, u32 expires) {
+ if (n->in_timer) {
+ printk("ERROR: trying to add timer, but already there, neigh:
%p\n", n);
+ dump_stack();
+ }
+ else {
+ n->timer.expires = expires;
+ neigh_hold(n, NDRK_NEIGH_TIMER);
+ add_timer(&n->timer);
+ n->in_timer = 1;
return 1;
}
return 0;
@@ -748,6 +768,9 @@ static void neigh_timer_handler(unsigned
write_lock(&neigh->lock);
+ /* We are out of the timer list at this point. */
+ neigh->in_timer = 0;
+
state = neigh->nud_state;
now = jiffies;
next = now + HZ;
@@ -817,12 +840,9 @@ static void neigh_timer_handler(unsigned
}
if (neigh->nud_state & NUD_IN_TIMER) {
- neigh_hold(neigh, NDRK_NEIGH_TIMER);
if (time_before(next, jiffies + HZ/2))
next = jiffies + HZ/2;
- neigh->timer.expires = next;
- neigh->nud_state |= NUD_IN_TIMER;
- add_timer(&neigh->timer);
+ neigh_add_timer(neigh, next);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
struct sk_buff *skb = skb_peek(&neigh->arp_queue);
@@ -863,10 +883,7 @@ int __neigh_event_send(struct neighbour
if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
atomic_set(&neigh->probes, neigh->parms->ucast_probes);
neigh->nud_state = NUD_INCOMPLETE;
- neigh_hold(neigh, NDRK_NEIGH_TIMER);
- neigh->timer.expires = now + 1;
- neigh->nud_state |= NUD_IN_TIMER;
- add_timer(&neigh->timer);
+ neigh_add_timer(neigh, now + 1);
} else {
neigh->nud_state = NUD_FAILED;
write_unlock_bh(&neigh->lock);
@@ -877,11 +894,8 @@ int __neigh_event_send(struct neighbour
}
} else if (neigh->nud_state & NUD_STALE) {
NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
- neigh_hold(neigh, NDRK_NEIGH_TIMER);
neigh->nud_state = NUD_DELAY;
- neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
- neigh->nud_state |= NUD_IN_TIMER;
- add_timer(&neigh->timer);
+ neigh_add_timer(neigh, jiffies +
neigh->parms->delay_probe_time);
}
if (neigh->nud_state == NUD_INCOMPLETE) {
@@ -1026,12 +1040,9 @@ int neigh_update(struct neighbour *neigh
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_IN_TIMER) {
- neigh_hold(neigh, NDRK_NEIGH_TIMER);
- neigh->timer.expires = jiffies +
+ neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
- neigh->parms->reachable_time :
0);
- neigh->nud_state |= NUD_IN_TIMER;
- add_timer(&neigh->timer);
+ neigh->parms->reachable_time :
0)));
}
neigh->nud_state = new;
}
--
Ben Greear <[EMAIL PROTECTED]>
Candela Technologies Inc http://www.candelatech.com
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html