Re: [OTP] SMP board recommendations?

2001-02-16 Thread Roeland Th. Jansen

On Thu, Feb 15, 2001 at 04:38:37PM -0800, David D.W. Downey wrote:
> I've tried the Abit VP6 and the MSI 6321 (694D Pro). Both give me the APIC
> errors with system lockups on heavy I/O using the 2.4.1-ac1# and the
> 2.4.2-pre# kernels. (The ac-## line doesn't die ANYWHERE near as often as
> the other board.)



the APIC code has been modified quite a bit and Maciej's fixes so far,
on this part shows that my BP6 stays alive while even the -AC kernels
were killed. I'd suggest you to try his patches and see if that works
for you.

IIRC, this is the one :

patch-2.4.1-io_apic-46
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/apic.c 
linux-2.4.1/arch/i386/kernel/apic.c
--- linux-2.4.1.macro/arch/i386/kernel/apic.c   Wed Dec 13 23:54:27 2000
+++ linux-2.4.1/arch/i386/kernel/apic.c Mon Feb 12 16:11:15 2001
@@ -23,6 +23,7 @@
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 
+#include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -270,7 +271,13 @@ void __init setup_local_APIC (void)
 *   PCI Ne2000 networking cards and PII/PIII processors, dual
 *   BX chipset. ]
 */
-#if 0
+   /*
+* Actually disabling the focus CPU check just makes the hang less
+* frequent as it makes the interrupt distributon model be more
+* like LRU than MRU (the short-term load is more even across CPUs).
+* See also the comment in end_level_ioapic_irq().  --macro
+*/
+#if 1
/* Enable focus processor (bit==0) */
value &= ~(1<<9);
 #else
@@ -764,7 +771,7 @@ asmlinkage void smp_error_interrupt(void
apic_write(APIC_ESR, 0);
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
-   irq_err_count++;
+   atomic_inc(&irq_err_count);
 
/* Here is what the APIC error bits mean:
   0: Send CS error
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/i8259.c 
linux-2.4.1/arch/i386/kernel/i8259.c
--- linux-2.4.1.macro/arch/i386/kernel/i8259.c  Mon Nov 20 18:01:58 2000
+++ linux-2.4.1/arch/i386/kernel/i8259.c Sun Feb 11 19:54:33 2001
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 
+#include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -321,7 +322,7 @@ spurious_8259A_irq:
printk("spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
-   irq_err_count++;
+   atomic_inc(&irq_err_count);
/*
 * Theoretically we do not have to handle this IRQ,
 * but in Linux this does not cause problems and is
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/io_apic.c 
linux-2.4.1/arch/i386/kernel/io_apic.c
--- linux-2.4.1.macro/arch/i386/kernel/io_apic.c Sat Feb  3 12:05:49 2001
+++ linux-2.4.1/arch/i386/kernel/io_apic.c  Tue Feb 13 19:59:55 2001
@@ -33,6 +33,8 @@
 #include <asm/smp.h>
 #include <asm/desc.h>
 
+#define APIC_LOCKUP_DEBUG
+
 static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED;
 
 /*
@@ -122,8 +124,14 @@ static void add_pin_to_irq(unsigned int 
static void name##_IO_APIC_irq (unsigned int irq)   \
__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,0, |= 0x0001, io_apic_sync(entry->apic))/* mask = 1 */
-DO_ACTION( __unmask,  0, &= 0xfffe, )  /* mask = 0 */
+DO_ACTION( __mask, 0, |= 0x0001, io_apic_sync(entry->apic) )
+   /* mask = 1 */
+DO_ACTION( __unmask,   0, &= 0xfffe, )
+   /* mask = 0 */
+DO_ACTION( __mask_and_edge,0, = (reg & 0x7fff) | 0x0001, )
+   /* mask = 1, trigger = 0 */
+DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffe) | 0x8000, )
+   /* mask = 0, trigger = 1 */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -847,6 +855,8 @@ void /*__init*/ print_local_APIC(void * 
 
v = apic_read(APIC_EOI);
printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+   v = apic_read(APIC_RRR);
+   printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
v = apic_read(APIC_DFR);
@@ -1191,12 +1201,61 @@ static unsigned int startup_level_ioapic
 #define enable_level_ioapic_irq    unmask_IO_APIC_irq
 #define disable_level_ioapic_irq   mask_IO_APIC_irq
 
-static void end_level_ioapic_irq (unsigned int i)
+static void end_level_ioapic_irq (unsigned int irq)
 {
+   unsigned long v;
+
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets).  Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless.  As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts 

Re: [OTP] SMP board recommendations?

2001-02-16 Thread Roeland Th. Jansen

On Thu, Feb 15, 2001 at 04:38:37PM -0800, David D.W. Downey wrote:
> I've tried the Abit VP6 and the MSI 6321 (694D Pro). Both give me the APIC
> errors with system lockups on heavy I/O using the 2.4.1-ac1# and the
> 2.4.2-pre# kernels. (The ac-## line doesn't die ANYWHERE near as often as
> the other board.)



the APIC code has been modified quite a bit and Maciej's fixes so far,
on this part shows that my BP6 stays alive while even the -AC kernels
were killed. I'd suggest you to try his patches and see if that works
for you.

IIRC, this is the one :

patch-2.4.1-io_apic-46
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/apic.c 
linux-2.4.1/arch/i386/kernel/apic.c
--- linux-2.4.1.macro/arch/i386/kernel/apic.c   Wed Dec 13 23:54:27 2000
+++ linux-2.4.1/arch/i386/kernel/apic.c Mon Feb 12 16:11:15 2001
@@ -23,6 +23,7 @@
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 
+#include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -270,7 +271,13 @@ void __init setup_local_APIC (void)
 *   PCI Ne2000 networking cards and PII/PIII processors, dual
 *   BX chipset. ]
 */
-#if 0
+   /*
+* Actually disabling the focus CPU check just makes the hang less
+* frequent as it makes the interrupt distributon model be more
+* like LRU than MRU (the short-term load is more even across CPUs).
+* See also the comment in end_level_ioapic_irq().  --macro
+*/
+#if 1
/* Enable focus processor (bit==0) */
value &= ~(1<<9);
 #else
@@ -764,7 +771,7 @@ asmlinkage void smp_error_interrupt(void
apic_write(APIC_ESR, 0);
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
-   irq_err_count++;
+   atomic_inc(&irq_err_count);
 
/* Here is what the APIC error bits mean:
   0: Send CS error
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/i8259.c 
linux-2.4.1/arch/i386/kernel/i8259.c
--- linux-2.4.1.macro/arch/i386/kernel/i8259.c  Mon Nov 20 18:01:58 2000
+++ linux-2.4.1/arch/i386/kernel/i8259.c Sun Feb 11 19:54:33 2001
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 
+#include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -321,7 +322,7 @@ spurious_8259A_irq:
printk("spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
-   irq_err_count++;
+   atomic_inc(&irq_err_count);
/*
 * Theoretically we do not have to handle this IRQ,
 * but in Linux this does not cause problems and is
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/io_apic.c 
linux-2.4.1/arch/i386/kernel/io_apic.c
--- linux-2.4.1.macro/arch/i386/kernel/io_apic.c Sat Feb  3 12:05:49 2001
+++ linux-2.4.1/arch/i386/kernel/io_apic.c  Tue Feb 13 19:59:55 2001
@@ -33,6 +33,8 @@
 #include <asm/smp.h>
 #include <asm/desc.h>
 
+#define APIC_LOCKUP_DEBUG
+
 static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED;
 
 /*
@@ -122,8 +124,14 @@ static void add_pin_to_irq(unsigned int 
static void name##_IO_APIC_irq (unsigned int irq)   \
__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,0, |= 0x0001, io_apic_sync(entry->apic))/* mask = 1 */
-DO_ACTION( __unmask,  0, &= 0xfffe, )  /* mask = 0 */
+DO_ACTION( __mask, 0, |= 0x0001, io_apic_sync(entry->apic) )
+   /* mask = 1 */
+DO_ACTION( __unmask,   0, &= 0xfffe, )
+   /* mask = 0 */
+DO_ACTION( __mask_and_edge,0, = (reg & 0x7fff) | 0x0001, )
+   /* mask = 1, trigger = 0 */
+DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffe) | 0x8000, )
+   /* mask = 0, trigger = 1 */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -847,6 +855,8 @@ void /*__init*/ print_local_APIC(void * 
 
v = apic_read(APIC_EOI);
printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+   v = apic_read(APIC_RRR);
+   printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
v = apic_read(APIC_DFR);
@@ -1191,12 +1201,61 @@ static unsigned int startup_level_ioapic
 #define enable_level_ioapic_irq    unmask_IO_APIC_irq
 #define disable_level_ioapic_irq   mask_IO_APIC_irq
 
-static void end_level_ioapic_irq (unsigned int i)
+static void end_level_ioapic_irq (unsigned int irq)
 {
+   unsigned long v;
+
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets).  Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as 

Re: [OTP] SMP board recommendations?

2001-02-15 Thread Andre Hedrick


Hi David,

Just to let you and the rest of the world in on a secret, 'ASL, Inc.' is
the premier ATA server system builder.  Jeff Nguyen is the only person
that I knew two years ago that was a pioneer and I have shared some
information with him before in the past, but here is ATA and it is here to
stay.

Cheers,

Andre Hedrick
Linux ATA Development
ASL Kernel Development
-
ASL, Inc. Toll free: 1-877-ASL-3535
1757 Houret Court Fax: 1-408-941-2071
Milpitas, CA 95035Web: www.aslab.com

*** shameless toys of creation to challenge the GB/$$ *** 
http://www.aslab.com/contents/servers/Sovereign-3400T.html
http://www.aslab.com/contents/servers/Sovereign-3450T.html


On Thu, 15 Feb 2001, David D.W. Downey wrote:

> Thank you all for your response.
> 
> Andre (ASL), thanks for the assist. Laurie and Janine took care of me.
> Asus CUV4X-D mobo with 1GB of buffered ECC RAM. I'm in the process of
> transferring all the hardware to the new board. I'll let you know if this
> new board solves the APIC errors and the random lockups under heavy I/O
> problems.
> 
> I do have one more problem that I just can NOT track down.
> 
> 2.4.1-ac10 kernel on the old Abit VP6 mobo. I'm getting curious errors
> from the 2.4.1, 2.4.1-ac10, and 2.4.2-pre[#] kernels.
> 
> I've been using
> 
> dd if=/dev/zero of=/tmp/testdd.img bs=1024k count=1500
> 
> for testing of I/O on the various boards I have here. Now, the funny part
> is that I get "file size limit exceeded" at around 1.0GB. I was getting
> this under the 2.4.2-pre# kernels so i switched to straight 2.4.1 and got
> the same problem. I switched to the 2.4.1-ac# line and the problem
> disappeared. Guess what? It's baaacckk!
> 
> So, I did a strace of the dd command and got the following from it
> 
> execve("/bin/dd", ["dd", "if=/dev/zero", "of=/tmp/testing.img", "bs=1024k", 
>"count=1500"], [/* 22 vars */]) = 0
> brk(0)  = 0x804e7b8
> open("/etc/ld.so.preload", O_RDONLY)= -1 ENOENT (No such file or directory)
> open("/etc/ld.so.cache", O_RDONLY)  = 3
> fstat(3, {st_mode=S_IFREG|0644, st_size=7852, ...}) = 0
> old_mmap(NULL, 7852, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40015000
> close(3)= 0
> open("/lib/libc.so.6", O_RDONLY)= 3
> fstat(3, {st_mode=S_IFREG|0755, st_size=1183326, ...}) = 0
> read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200\215"..., 4096) = 4096
> old_mmap(NULL, 947548, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x40017000
> mprotect(0x400f7000, 30044, PROT_NONE)  = 0
> old_mmap(0x400f7000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0xdf000) 
>= 0x400f7000
> old_mmap(0x400fb000, 13660, PROT_READ|PROT_WRITE, 
>MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x400fb000
> close(3)= 0
> old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
>0x400ff000
> mprotect(0x40017000, 917504, PROT_READ|PROT_WRITE) = 0
> mprotect(0x40017000, 917504, PROT_READ|PROT_EXEC) = 0
> munmap(0x40015000, 7852)= 0
> personality(PER_LINUX)  = 0
> getpid()= 195
> brk(0)  = 0x804e7b8
> brk(0x804e7f0)  = 0x804e7f0
> brk(0x804f000)  = 0x804f000
> open("/dev/zero", O_RDONLY|O_LARGEFILE) = 3
> open("/tmp/testing.img", O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 4
> rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
> rt_sigaction(SIGINT, {0x804ada8, [], 0x400}, NULL, 8) = 0
> rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
> rt_sigaction(SIGQUIT, {0x804ada8, [], 0x400}, NULL, 8) = 0
> rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
> rt_sigaction(SIGPIPE, {0x804ada8, [], 0x400}, NULL, 8) = 0
> rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
> rt_sigaction(SIGUSR1, {0x804ae70, [], 0x400}, NULL, 8) = 0
> old_mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
>0x4010
> read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
>1048576
> write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
>1048576
> 
> * BIG ASS SNIP **
> 
> read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
>1048576
> write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 
>EFBIG (File too large)
> --- SIGXFSZ (File size limit exceeded) ---
> +++ killed by SIGXFSZ +++
> 
> 
> 
> Now, notice the beginning file creation call. It starts out with
> O_LARGEFILE but ends with EFBIG. Since I'm not totally familiar with the
> kernel code I could be wrong on my next statement and if I am, please tell
> me, but it looks like it changes the file creation call from LARGEFILE to
> EFBIG (or is this just the error call 

Re: [OTP] SMP board recommendations?

2001-02-15 Thread David D.W. Downey

Thank you all for your response.

Andre (ASL), thanks for the assist. Laurie and Janine took care of me.
Asus CUV4X-D mobo with 1GB of buffered ECC RAM. I'm in the process of
transferring all the hardware to the new board. I'll let you know if this
new board solves the APIC errors and the random lockups under heavy I/O
problems.

I do have one more problem that I just can NOT track down.

2.4.1-ac10 kernel on the old Abit VP6 mobo. I'm getting curious errors
from the 2.4.1, 2.4.1-ac10, and 2.4.2-pre[#] kernels.

I've been using

dd if=/dev/zero of=/tmp/testdd.img bs=1024k count=1500

for testing of I/O on the various boards I have here. Now, the funny part
is that I get "file size limit exceeded" at around 1.0GB. I was getting
this under the 2.4.2-pre# kernels so i switched to straight 2.4.1 and got
the same problem. I switched to the 2.4.1-ac# line and the problem
disappeared. Guess what? It's baaacckk!

So, I did a strace of the dd command and got the following from it

execve("/bin/dd", ["dd", "if=/dev/zero", "of=/tmp/testing.img", "bs=1024k", 
"count=1500"], [/* 22 vars */]) = 0
brk(0)  = 0x804e7b8
open("/etc/ld.so.preload", O_RDONLY)= -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)  = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=7852, ...}) = 0
old_mmap(NULL, 7852, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40015000
close(3)= 0
open("/lib/libc.so.6", O_RDONLY)= 3
fstat(3, {st_mode=S_IFREG|0755, st_size=1183326, ...}) = 0
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200\215"..., 4096) = 4096
old_mmap(NULL, 947548, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x40017000
mprotect(0x400f7000, 30044, PROT_NONE)  = 0
old_mmap(0x400f7000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0xdf000) = 
0x400f7000
old_mmap(0x400fb000, 13660, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 
-1, 0) = 0x400fb000
close(3)= 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x400ff000
mprotect(0x40017000, 917504, PROT_READ|PROT_WRITE) = 0
mprotect(0x40017000, 917504, PROT_READ|PROT_EXEC) = 0
munmap(0x40015000, 7852)= 0
personality(PER_LINUX)  = 0
getpid()= 195
brk(0)  = 0x804e7b8
brk(0x804e7f0)  = 0x804e7f0
brk(0x804f000)  = 0x804f000
open("/dev/zero", O_RDONLY|O_LARGEFILE) = 3
open("/tmp/testing.img", O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 4
rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGQUIT, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGPIPE, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGUSR1, {0x804ae70, [], 0x400}, NULL, 8) = 0
old_mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x4010
read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
1048576

* BIG ASS SNIP **

read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 
EFBIG (File too large)
--- SIGXFSZ (File size limit exceeded) ---
+++ killed by SIGXFSZ +++



Now, notice the beginning file creation call. It starts out with
O_LARGEFILE but ends with EFBIG. Since I'm not totally familiar with the
kernel code I could be wrong on my next statement and if I am, please tell
me, but it looks like it changes the file creation call from LARGEFILE to
EFBIG (or is this just the error call itself?)

Now, the kernel is supposed to be able to handle creating a 4TB file(?),
so 1.0GB should be nothing to it. NOTHING changed between it working and
not working. No hardware changes, no software additions, no recompiles of
existing applications/daemons.. nothing.

So, my question is now one of "What gives?" Any clues on how I can check
to see what's going wrong? Is my gut feeling that it's changing the file
type wrong? (IIUC, there are different open() calls for different size
files? No, I have nothing to base this one, just something I flashed on
and thought might explain the problem.)

I'm learning here guys, so please be gentle. You folks are the only ones I
have with the experience to tell me when I'm just fscked in the head and
when I'm bang on.

-- 
David D.W. Downey - RHCE
Consulting Engineer
Ensim Corporation - Sunnyvale, CA

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at 

Re: [OTP] SMP board recommendations?

2001-02-15 Thread David D.W. Downey

Thank you all for your response.

Andre (ASL), thanks for the assist. Laurie and Janine took care of me.
Asus CUV4X-D mobo with 1GB of buffered ECC RAM. I'm in the process of
transferring all the hardware to the new board. I'll let you know if this
new board solves the APIC errors and the random lockups under heavy I/O
problems.

I do have one more problem that I just can NOT track down.

2.4.1-ac10 kernel on the old Abit VP6 mobo. I'm getting curious errors
from the 2.4.1, 2.4.1-ac10, and 2.4.2-pre[#] kernels.

I've been using

dd if=/dev/zero of=/tmp/testdd.img bs=1024k count=1500

for testing of I/O on the various boards I have here. Now, the funny part
is that I get "file size limit exceeded" at around 1.0GB. I was getting
this under the 2.4.2-pre# kernels so i switched to straight 2.4.1 and got
the same problem. I switched to the 2.4.1-ac# line and the problem
disappeared. Guess what? It's baaacckk!

So, I did a strace of the dd command and got the following from it

execve("/bin/dd", ["dd", "if=/dev/zero", "of=/tmp/testing.img", "bs=1024k", 
"count=1500"], [/* 22 vars */]) = 0
brk(0)  = 0x804e7b8
open("/etc/ld.so.preload", O_RDONLY)= -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)  = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=7852, ...}) = 0
old_mmap(NULL, 7852, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40015000
close(3)= 0
open("/lib/libc.so.6", O_RDONLY)= 3
fstat(3, {st_mode=S_IFREG|0755, st_size=1183326, ...}) = 0
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200\215"..., 4096) = 4096
old_mmap(NULL, 947548, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x40017000
mprotect(0x400f7000, 30044, PROT_NONE)  = 0
old_mmap(0x400f7000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0xdf000) = 
0x400f7000
old_mmap(0x400fb000, 13660, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 
-1, 0) = 0x400fb000
close(3)= 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x400ff000
mprotect(0x40017000, 917504, PROT_READ|PROT_WRITE) = 0
mprotect(0x40017000, 917504, PROT_READ|PROT_EXEC) = 0
munmap(0x40015000, 7852)= 0
personality(PER_LINUX)  = 0
getpid()= 195
brk(0)  = 0x804e7b8
brk(0x804e7f0)  = 0x804e7f0
brk(0x804f000)  = 0x804f000
open("/dev/zero", O_RDONLY|O_LARGEFILE) = 3
open("/tmp/testing.img", O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 4
rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGQUIT, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGPIPE, {0x804ada8, [], 0x400}, NULL, 8) = 0
rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
rt_sigaction(SIGUSR1, {0x804ae70, [], 0x400}, NULL, 8) = 0
old_mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x4010
read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
1048576

* BIG ASS SNIP **

read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 
EFBIG (File too large)
--- SIGXFSZ (File size limit exceeded) ---
+++ killed by SIGXFSZ +++



Now, notice the beginning file creation call. It starts out with
O_LARGEFILE but ends with EFBIG. Since I'm not totally familiar with the
kernel code I could be wrong on my next statement and if I am, please tell
me, but it looks like it changes the file creation call from LARGEFILE to
EFBIG (or is this just the error call itself?)

Now, the kernel is supposed to be able to handle creating a 4TB file(?),
so 1.0GB should be nothing to it. NOTHING changed between it working and
not working. No hardware changes, no software additions, no recompiles of
existing applications/daemons.. nothing.

So, my question is now one of "What gives?" Any clues on how I can check
to see what's going wrong? Is my gut feeling that it's changing the file
type wrong? (IIUC, there are different open() calls for different size
files? No, I have nothing to base this one, just something I flashed on
and thought might explain the problem.)

I'm learning here guys, so please be gentle. You folks are the only ones I
have with the experience to tell me when I'm just fscked in the head and
when I'm bang on.

-- 
David D.W. Downey - RHCE
Consulting Engineer
Ensim Corporation - Sunnyvale, CA

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at 

Re: [OTP] SMP board recommendations?

2001-02-15 Thread Andre Hedrick


Hi David,

Just to let you and the rest of the world in on a secret, 'ASL, Inc.' is
the premier ATA server system builder.  Jeff Nguyen is the only person
that I knew two years ago that was a pioneer and I have shared some
information with him before in the past, but here is ATA and it is here to
stay.

Cheers,

Andre Hedrick
Linux ATA Development
ASL Kernel Development
-
ASL, Inc. Toll free: 1-877-ASL-3535
1757 Houret Court Fax: 1-408-941-2071
Milpitas, CA 95035Web: www.aslab.com

*** shameless toys of creation to challenge the GB/$$ *** 
http://www.aslab.com/contents/servers/Sovereign-3400T.html
http://www.aslab.com/contents/servers/Sovereign-3450T.html


On Thu, 15 Feb 2001, David D.W. Downey wrote:

 Thank you all for your response.
 
 Andre (ASL), thanks for the assist. Laurie and Janine took care of me.
 Asus CUV4X-D mobo with 1GB of buffered ECC RAM. I'm in the process of
 transferring all the hardware to the new board. I'll let you know if this
 new board solves the APIC errors and the random lockups under heavy I/O
 problems.
 
 I do have one more problem that I just can NOT track down.
 
 2.4.1-ac10 kernel on the old Abit VP6 mobo. I'm getting curious errors
 from the 2.4.1, 2.4.1-ac10, and 2.4.2-pre[#] kernels.
 
 I've been using
 
 dd if=/dev/zero of=/tmp/testdd.img bs=1024k count=1500
 
 for testing of I/O on the various boards I have here. Now, the funny part
 is that I get "file size limit exceeded" at around 1.0GB. I was getting
 this under the 2.4.2-pre# kernels so i switched to straight 2.4.1 and got
 the same problem. I switched to the 2.4.1-ac# line and the problem
 disappeared. Guess what? It's baaacckk!
 
 So, I did a strace of the dd command and got the following from it
 
 execve("/bin/dd", ["dd", "if=/dev/zero", "of=/tmp/testing.img", "bs=1024k", 
"count=1500"], [/* 22 vars */]) = 0
 brk(0)  = 0x804e7b8
 open("/etc/ld.so.preload", O_RDONLY)= -1 ENOENT (No such file or directory)
 open("/etc/ld.so.cache", O_RDONLY)  = 3
 fstat(3, {st_mode=S_IFREG|0644, st_size=7852, ...}) = 0
 old_mmap(NULL, 7852, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40015000
 close(3)= 0
 open("/lib/libc.so.6", O_RDONLY)= 3
 fstat(3, {st_mode=S_IFREG|0755, st_size=1183326, ...}) = 0
 read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200\215"..., 4096) = 4096
 old_mmap(NULL, 947548, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x40017000
 mprotect(0x400f7000, 30044, PROT_NONE)  = 0
 old_mmap(0x400f7000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3, 0xdf000) 
= 0x400f7000
 old_mmap(0x400fb000, 13660, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x400fb000
 close(3)= 0
 old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x400ff000
 mprotect(0x40017000, 917504, PROT_READ|PROT_WRITE) = 0
 mprotect(0x40017000, 917504, PROT_READ|PROT_EXEC) = 0
 munmap(0x40015000, 7852)= 0
 personality(PER_LINUX)  = 0
 getpid()= 195
 brk(0)  = 0x804e7b8
 brk(0x804e7f0)  = 0x804e7f0
 brk(0x804f000)  = 0x804f000
 open("/dev/zero", O_RDONLY|O_LARGEFILE) = 3
 open("/tmp/testing.img", O_RDWR|O_CREAT|O_TRUNC|O_LARGEFILE, 0666) = 4
 rt_sigaction(SIGINT, NULL, {SIG_DFL}, 8) = 0
 rt_sigaction(SIGINT, {0x804ada8, [], 0x400}, NULL, 8) = 0
 rt_sigaction(SIGQUIT, NULL, {SIG_DFL}, 8) = 0
 rt_sigaction(SIGQUIT, {0x804ada8, [], 0x400}, NULL, 8) = 0
 rt_sigaction(SIGPIPE, NULL, {SIG_DFL}, 8) = 0
 rt_sigaction(SIGPIPE, {0x804ada8, [], 0x400}, NULL, 8) = 0
 rt_sigaction(SIGUSR1, NULL, {SIG_DFL}, 8) = 0
 rt_sigaction(SIGUSR1, {0x804ae70, [], 0x400}, NULL, 8) = 0
 old_mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 
0x4010
 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
1048576
 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
1048576
 
 * BIG ASS SNIP **
 
 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 
1048576
 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 
EFBIG (File too large)
 --- SIGXFSZ (File size limit exceeded) ---
 +++ killed by SIGXFSZ +++
 
 
 
 Now, notice the beginning file creation call. It starts out with
 O_LARGEFILE but ends with EFBIG. Since I'm not totally familiar with the
 kernel code I could be wrong on my next statement and if I am, please tell
 me, but it looks like it changes the file creation call from LARGEFILE to
 EFBIG (or is this just the error call itself?)
 
 Now, the kernel is supposed to be able to handle creating a 4TB file(?),
 so