On Mon, 2 Nov 1998, Mr M S Aitchison wrote:

> Subject: "eth0: Re-entering the ..." lockup with tulip.c:v0.98H
> 
> We've started getting frequent lockups on a 2.0.35 SMP kernel dual
> Pentium II with the repeated error message:
> 
> eth0: Re-entering the interrupt handler with proc 0, proc 0 already handling.
> 
> The NIC is an SMC 10/100 tulip, and the driver is v0.98H 5/23/98; I
> know this problem was reported with several cards under SMP early this
> year, but I thought it was solved.  I could try changing to an
> epic-based card, or go back to 2.0.34, or disable SMP.

This is NOT a driver bug.  It's a bug in the SMP code.
The re-entry check exists in a few common ethernet drivers because no one
believed that there was a bug, and the crashes were blamed on the drivers.

[[ IMO, network device drivers should not need to know that the machine is
SMP.  When turning off SMP fixes the problem, that likely points to a
problem in the SMP code, not the driver. ]]

This bug likely affects other drivers as well.

The following fix was suggested by Loic Prylli <[EMAIL PROTECTED]>

________________
From: Loic Prylli <[EMAIL PROTECTED]>

I may have found a possible cause, in the Linux interrupt code in
asm-i386/irq.h, correct me if I am wrong:
 The code assumes a processor acquires the global kernel lock before
manipulating the cache_A1 and cache_21 variables and changing the
interrupt mask register. But this is not respected by the MSGIRQ
handler associated with IPI interrupts: there seems to be a race
condition where the end of the IPI handler can be executed
concurrently with a normal interrupt, leading to wrong settings for the
interrupt mask registers and cache_A1.

The problem will occur more often with fast networks because there are
more frequent interrupts, but I think any SMP configuration could
potentially be affected, even with no network.

 If you had problems before, you can try the patch below; I hope it does
not break anything. There are two parts:
- in arch/i386/kernel/irq.c we try to detect whether we are in an incoherent
state; for instance, without the other change I get this kind of output:

 kernel: IRQ 15 (proc 0):cache_x1=0x72,INT mask=0xd2
 kernel: IRQ 15 (proc 0):cache_x1=0x52,INT mask=0x52
 kernel: IRQ 15 (proc 0):cache_x1=0x52,INT mask=0xf2

As you can see, we can be processing interrupt 15 without the corresponding
bit set in cache_A1. Sometimes the interrupt mask register bit is set,
sometimes not.  Moreover, there are strange things occurring with the
bits associated with the IPI interrupt.

- in include/asm-i386/irq.h, I changed the IPI
interrupt code, so that it does not touch the interrupt mask register
nor cache_A1.

This patch works well for me, but as I have hardly any knowledge about
the IOAPIC used for IPI, I would appreciate comments from someone more
knowledgeable.

Loic


--foelzrdq+r
Content-Type: TEXT/PLAIN; CHARSET=US-ASCII
Content-ID: <Pine.SUN.3.91.981027120122.11250E@yukon>

--- linux/arch/i386/kernel/irq.c.std    Sat Oct  3 20:22:39 1998
+++ linux/arch/i386/kernel/irq.c        Sat Oct  3 20:36:15 1998
@@ -345,7 +345,25 @@
 {
        struct irqaction * action = *(irq + irq_action);
        int do_random = 0;
-
+       int c,intm,mask;
+       static int count;
+       if (smp_processor_id() != 0 && count++ < 1000)
+         printk("IRQ %d: done by CPU %d\n",irq,smp_processor_id());
+       if (irq  >= 8) {
+         c = cache_A1;
+         intm = inb(0xA1);
+         mask =  1 << (irq - 8);
+       } else {
+         c = cache_21;
+         intm = inb(0x21);
+         mask =  1 << irq;
+       }
+       if (!(c & mask) || !(intm & mask)) {
+         printk("IRQ %d (proc %d):cache_x1=0x%x,INT mask=0x%x\n", irq, 
+smp_processor_id(),c,intm);
+         /* better to return because the interrupt may be asserted again,
+            the bad thing is that we may loose some interrupts */
+         return;
+       }
 #ifdef __SMP__
        if(smp_threads_ready && active_kernel_processor!=smp_processor_id())
                panic("IRQ %d: active processor set wrongly(%d not %d).\n", irq, 
active_kernel_processor, smp_processor_id());
--- linux/include/asm-i386/irq.h.std    Sat Oct  3 20:22:59 1998
+++ linux/include/asm-i386/irq.h        Sat Oct  3 22:35:29 1998
@@ -108,6 +108,17 @@
        "1:\tjmp 1f\n" \
        "1:\toutb %al,$0x20\n\t"
 
+/* do not modify the ISR nor the cache_A1 variable */
+#define MSGACK_SECOND(mask,nr) \
+       "inb $0xA1,%al\n\t" \
+       "jmp 1f\n" \
+       "1:\tjmp 1f\n" \
+       "1:\tmovb $0x20,%al\n\t" \
+       "outb %al,$0xA0\n\t" \
+       "jmp 1f\n" \
+       "1:\tjmp 1f\n" \
+       "1:\toutb %al,$0x20\n\t"
+
 #define UNBLK_FIRST(mask) \
        "inb $0x21,%al\n\t" \
        "jmp 1f\n" \
@@ -302,34 +313,14 @@
 __asm__( \
 "\n"__ALIGN_STR"\n" \
 SYMBOL_NAME_STR(IRQ) #nr "_interrupt:\n\t" \
-       "pushl $-"#nr"-2\n\t" \
-       SAVE_ALL \
-       ENTER_KERNEL \
-       ACK_##chip(mask,(nr&7)) \
-       "incl "SYMBOL_NAME_STR(intr_count)"\n\t"\
-       "sti\n\t" \
-       "movl %esp,%ebx\n\t" \
-       "pushl %ebx\n\t" \
-       "pushl $" #nr "\n\t" \
-       "call "SYMBOL_NAME_STR(do_IRQ)"\n\t" \
-       "addl $8,%esp\n\t" \
-       "cli\n\t" \
-       UNBLK_##chip(mask) \
-       GET_PROCESSOR_ID \
-       "btrl $" STR(SMP_FROM_INT) ","SYMBOL_NAME_STR(smp_proc_in_lock)"(,%eax,4)\n\t" 
\
-       "decl "SYMBOL_NAME_STR(intr_count)"\n\t" \
-       "incl "SYMBOL_NAME_STR(syscall_count)"\n\t" \
-       "jmp ret_from_sys_call\n" \
-"\n"__ALIGN_STR"\n" \
 SYMBOL_NAME_STR(fast_IRQ) #nr "_interrupt:\n\t" \
        SAVE_MOST \
-       ACK_##chip(mask,(nr&7)) \
+       MSGACK_##chip(mask,(nr&7)) \
        SMP_PROF_IPI_CNT \
        "pushl $" #nr "\n\t" \
        "call "SYMBOL_NAME_STR(do_fast_IRQ)"\n\t" \
        "addl $4,%esp\n\t" \
        "cli\n\t" \
-       UNBLK_##chip(mask) \
        RESTORE_MOST \
 "\n"__ALIGN_STR"\n" \
 SYMBOL_NAME_STR(bad_IRQ) #nr "_interrupt:\n\t" \
________________

Donald Becker                                     [EMAIL PROTECTED]
USRA-CESDIS, Center of Excellence in Space Data and Information Sciences.
Code 930.5, Goddard Space Flight Center,  Greenbelt, MD.  20771
301-286-0882         http://cesdis.gsfc.nasa.gov/people/becker/whoiam.html

Reply via email to