Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-12 Thread martin f krafft
also sprach D. North  [2009.07.13.0023 +0200]:
> > Do you have multipaths?
> 
> The multipath tools are installed -- but I don't think they're required. The
> selected packages for this system came from a package list on an etch system
> which this machine is replacing.

I have to ask: does the problem persist if you deinstall the
package (multipath-tools-boot)?

> I can certainly do that... however, enabling 'debug' on the kernel boot line
> affects the timing enough that the problem doesn't occur any more. Do you still
> want the debug log?

I love Heisenbugs. Yes, I think you should still include it.

-- 
 .''`.   martin f. krafft   Related projects:
: :'  :  proud Debian developer   http://debiansystem.info
`. `'`   http://people.debian.org/~madduck    http://vcs-pkg.org
  `-  Debian - when you have better things to do than fixing systems


digital_signature_gpg.asc
Description: Digital signature (see http://martin-krafft.net/gpg/)


Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-12 Thread D. North
Thus spake martin f krafft (madd...@debian.org):

> also sprach D. North  [2009.07.12.0535 +0200]:
> > Discovering multipaths...
> 
> Do you have multipaths?

The multipath tools are installed -- but I don't think they're required. The
selected packages for this system came from a package list on an etch system
which this machine is replacing.
 
> Why are you not using mdadm for multipaths?

Per the above, multipaths should not be required for this configuration.
Additionally, I didn't make a conscious selection of one over the other,
so unfortunately I can't give you a helpful answer to that. -- Should
I simply attempt to remove the multipaths-related packages?

> Do you have the dmraid package installed?

No.

> Can you please check out http://wiki.debian.org/InitramfsDebug at
> the bottom and make sure to submit the trace log of a failed boot
> with the debug kernel option?

I can certainly do that... however, enabling 'debug' on the kernel boot line
affects the timing enough that the problem doesn't occur any more. Do you still
want the debug log?

-
Thank you for the information on HOMEHOST ... now the last half of that UUID
being the same makes sense.

I have copied the requested output from /usr/share/bug/mdadm/script
below -- I will try to get to the 'strace' testing within the
next day... The segfault in mdadm does not occur every time, so it
may be a bit harder to catch.

phoenix:~# /usr/share/bug/mdadm/script 3>&1
--- mount output
/dev/md1 on / type ext3 (rw,errors=remount-ro)
tmpfs on /lib/init/rw type tmpfs (rw,nosuid,mode=0755)
proc on /proc type proc (rw,noexec,nosuid,nodev)
sysfs on /sys type sysfs (rw,noexec,nosuid,nodev)
procbususb on /proc/bus/usb type usbfs (rw)
udev on /dev type tmpfs (rw,mode=0755)
tmpfs on /dev/shm type tmpfs (rw,nosuid,nodev)
devpts on /dev/pts type devpts (rw,noexec,nosuid,gid=5,mode=620)
/dev/md0 on /boot type ext3 (rw)

--- mdadm.conf
# mdadm.conf
#
# Please refer to mdadm.conf(5) for information about this file.
#

# by default, scan all partitions (/proc/partitions) for MD superblocks.
# alternatively, specify devices to scan, using wildcards if desired.
DEVICE partitions

# auto-create devices with Debian standard permissions
CREATE owner=root group=disk mode=0660 auto=yes

# automatically tag new arrays as belonging to the local system
HOMEHOST <system>

# instruct the monitoring daemon where to send mail alerts
MAILADDR root

# definitions of existing MD arrays
# ARRAY /dev/md0 level=raid1 num-devices=2 UUID=abdd9eb3:faeb7c80:e30e8841:87878c43
ARRAY /dev/md0 level=raid1 num-devices=2 UUID=abdd9eb3:faeb7c80:34b6d411:a56b552d
ARRAY /dev/md1 level=raid1 num-devices=2 UUID=8d97d0a5:41763dfc:34b6d411:a56b552d

# This file was auto-generated on Thu, 02 Jul 2009 20:38:29 -0500
# by mkconf $Id$

--- /proc/mdstat:
Personalities : [raid1] 
md1 : active raid1 dm-3[0] dm-5[1]
  29302464 blocks [2/2] [UU]
  
md0 : active raid1 dm-2[0] dm-4[1]
  256896 blocks [2/2] [UU]
  
unused devices: <none>

--- /proc/partitions:
major minor  #blocks  name

   8     0 1465138584 sda
   8     1     257008 sda1
   8     2   29302560 sda2
   8    16 1465138584 sdb
   8    17     257008 sdb1
   8    18   29302560 sdb2
 254     0 1465138584 dm-0
 254     1 1465138584 dm-1
 254     2     257008 dm-2
 254     3   29302560 dm-3
 254     4     257008 dm-4
 254     5   29302560 dm-5
   9     0     256896 md0
   9     1   29302464 md1

--- initrd.img-2.6.26-2-686:
34790 blocks
8201107a7ff6f5dd640a0ed1f1b4112d  ./etc/mdadm/mdadm.conf
ea9abd44166c288560f8c9789cb3949d  ./sbin/mdadm
6992557752f284596c23923749a4a49d  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-multipath.ko
d8eab1e682b691f6f1721688d3d0dd80  ./lib/modules/2.6.26-2-686/kernel/drivers/md/raid456.ko
08540cbe514ace087bf5212a2e99ecb8  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-log.ko
3b44f0bc10916fb24b18f66e829ebd9d  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-mirror.ko
548a79e85ca844b52e0b91d714ea164a  ./lib/modules/2.6.26-2-686/kernel/drivers/md/raid10.ko
f3770f5935467286a47059e880efdef0  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-round-robin.ko
705b7ddf811fdd5bbd3b00a226cd75e5  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-emc.ko
358d290765df0456f4dd0de43e8f0137  ./lib/modules/2.6.26-2-686/kernel/drivers/md/linear.ko
16fab000929d0c7d4a0bb108dc97542e  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-mod.ko
c115ac530f79c6850886efbea2c6cb78  ./lib/modules/2.6.26-2-686/kernel/drivers/md/raid0.ko
f195929c5d8f2a3bb6aa0c87e6603e49  ./lib/modules/2.6.26-2-686/kernel/drivers/md/raid1.ko
893a16924cb7e15e82cd4224f081d945  ./lib/modules/2.6.26-2-686/kernel/drivers/md/multipath.ko
f645b1d4cd9945d255893c6759222f5e  ./lib/modules/2.6.26-2-686/kernel/drivers/md/md-mod.ko
22d3ea716544bdba4fb3c24e9fe3f7b2  ./lib/modules/2.6.26-2-686/kernel/drivers/md/dm-snapshot.ko
e1e2d0e985196fecaf41fb42e9968af2  ./scripts/local-top/mdadm

--- /proc/modules:
dm_mirror 15104 0 - Live 0xf894e000
dm_log 8484 1 dm_mirror

Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-12 Thread martin f krafft
also sprach D. North  [2009.07.12.0535 +0200]:
> Discovering multipaths...

Do you have multipaths?

Why are you not using mdadm for multipaths?

Do you have the dmraid package installed?

Can you please check out http://wiki.debian.org/InitramfsDebug at
the bottom and make sure to submit the trace log of a failed boot
with the debug kernel option?

-- 
 .''`.   martin f. krafft   Related projects:
: :'  :  proud Debian developer   http://debiansystem.info
`. `'`   http://people.debian.org/~madduck    http://vcs-pkg.org
  `-  Debian - when you have better things to do than fixing systems


digital_signature_gpg.asc
Description: Digital signature (see http://martin-krafft.net/gpg/)


Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-12 Thread D. North
Thus spake martin f krafft (madd...@d.o):

> also sprach D. North  [2009.07.12.0535 +0200]:
> > mdadm: no devices found for /dev/md0
> > Segmentation fault
>
> I suggest you run a hardware and memory check.

Thank you for your reply

Following your suggestion, I ran several memtest86 passes with no errors, a cpu
stress test from UBCD, and I have done a lot of bash-level dd & concurrent
multiple-file copies with no errors showing up in /var/log/messages or
/var/log/dmesg -- unfortunately, I really don't know how to stress the machine
further looking for errors -- any suggestions would be most appreciated.

I have also reseated the sata cables & moved them from hardware sockets for
'sata1/sata3' into sockets for 'sata0/sata1' (meaning the drives still
enumerate ata1/ata2), and that 'timing' problem seen in the initial report
('ata2: ACPI get timing mode failed (AE 0x300d)') seems to be gone, but the
boot failures continue to frequently occur without the sleep calls I put in
there as described in the initial report.

Also - somewhere along the line today, the UUIDs of the /dev/md0 members have
changed. I am not aware of anything I did to change them, and I do not know
exactly WHEN they changed. It concerns me quite a bit that the lower half of
the UUIDs now matches that of /dev/md1. Here are my updated and again-working array defs
from /etc/mdadm/mdadm.conf: (the commented array def WAS working prior to
today's re-testing)

  # definitions of existing MD arrays
  # ARRAY /dev/md0 level=raid1 num-devices=2 UUID=abdd9eb3:faeb7c80:e30e8841:87878c43
  ARRAY /dev/md0 level=raid1 num-devices=2 UUID=abdd9eb3:faeb7c80:34b6d411:a56b552d
  ARRAY /dev/md1 level=raid1 num-devices=2 UUID=8d97d0a5:41763dfc:34b6d411:a56b552d

I can do a fair amount of diagnostics here, but I sure could use some pointers
as to what to do, and with what tools.

Thanks.




-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org



Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-12 Thread martin f krafft
also sprach D. North  [2009.07.12.0535 +0200]:
> mdadm: no devices found for /dev/md0
> Segmentation fault

I suggest you run a hardware and memory check.

-- 
 .''`.   martin f. krafft   Related projects:
: :'  :  proud Debian developer   http://debiansystem.info
`. `'`   http://people.debian.org/~madduck    http://vcs-pkg.org
  `-  Debian - when you have better things to do than fixing systems


digital_signature_gpg.asc
Description: Digital signature (see http://martin-krafft.net/gpg/)


Bug#536644: initramfs-tools: Boot failure from software-RAID1 with debian Lenny

2009-07-11 Thread D. North
Subject: initramfs-tools: Boot failure from software-RAID1 with debian Lenny
Package: initramfs-tools
Version: 0.92o
Severity: critical
Justification: breaks the whole system

*** Please type your report below this line ***

On a newly built system, the boot sequence has been frequently failing as follows:

-
initrd  /initrd.img-2.6.26-2-686
  [Linux-initrd @ 0x378d1000, 0x71ee7b bytes] 


Decompressing Linux... Parsing ELF... done.
Booting the kernel.
[0.420026] PCI: Not using MMCONFIG.
Loading, please wait...
[5.494772] ata2: ACPI get timing mode failed (AE 0x300d)
Discovering multipaths...
mdadm: no devices found for /dev/md0
Segmentation fault
mount: mounting /dev/md1 on /root failed: No such device
mount: mounting /dev on /root/dev failed: No such file or directory
mount: mounting /sys on /root/sys failed: No such file or directory
mount: mounting /proc on /root/proc failed: No such file or directory
Target filesystem doesn't have /sbin/init.
No init found. Try passing init= bootarg. 


BusyBox v1.10.2 (Debian 1:1.10.2-2) built-in shell (ash)
:
:
(initramfs) mdadm --assemble --scan
mdadm: no devices found for /dev/md0
mdadm: /dev/md1 has been started with 1 drive (out of 2)
(initramfs)  
-

Note that the mdadm --assemble --scan works from within the initramfs.

The above sample is only one mode of failure. Sometimes, the system will come
up, but with reduced raidsets. Only once in a while does it come up OK. This is
seemingly a timing problem -- the most frequent failure mode is for md1 (the
root device) to assemble with one member and md0 to fail to assemble at all.

If I add 'debug' to the boot parameters, it introduces enough delay to come up
correctly almost every time. -- The rootdelay parameter does nothing to affect
this problem -- settings between 5 and 30 have been tried with no results.

I did find a workaround though: I have adjusted my local system to get a
reliable boot by editing /usr/share/initramfs-tools/scripts/mdadm and
/usr/share/initramfs-tools/scripts/multipath like this:

1) mdadm: right after 'maybe_break pre-mdadm'
   add '/bin/sleep 1'

2) mdadm: right after 'echo 1 > /sys/module/md_mod/parameters/start_ro'
   add '/bin/sleep 1'

3) multipath: after the last modprobe
   add '/bin/sleep 1'

4) multipath: after the call to '/sbin/multipath'
   add '/bin/sleep 1'

Obviously this is neither a good nor a proposed solution, but perhaps a place to
start in identifying the timing problem. From here, it looks like the
mdadm --assemble --scan is fired off before device mapper has finished
stabilizing things.

For reference, my hardware configuration is:
  Intel motherboard DG41RQ
  Core2 quad 6700
  2xST31500341AS (seagate 1.5T)

-- Package-specific info:
-- /proc/cmdline
root=/dev/md1 rootdelay=5 ro quiet

-- /proc/filesystems
ext3

-- lsmod
Module                  Size  Used by
i915                   25280  0
drm                    65192  1 i915
kvm_intel              31168  0
kvm                   106492  1 kvm_intel
ppdev                   6500  0
parport_pc             22500  0
lp                      8164  0
parport                30988  3 ppdev,parport_pc,lp
ipv6                  235364  47
loop                   12748  0
snd_hda_intel         325688  0
rng_core                3940  0
i2c_i801                7920  0
video                  16432  0
i2c_core               19828  1 i2c_i801
button                  6096  0
output                  2912  1 video
snd_pcm_oss            32832  0
snd_mixer_oss          12320  1 snd_pcm_oss
intel_agp              22556  1
agpgart                28776  3 drm,intel_agp
iTCO_wdt                9508  0
snd_pcm                62596  2 snd_hda_intel,snd_pcm_oss
snd_seq_dummy           2660  0
snd_seq_oss            24992  0
snd_seq_midi            5728  0
snd_rawmidi            18528  1 snd_seq_midi
snd_seq_midi_event      6432  2 snd_seq_oss,snd_seq_midi
snd_seq                41456  6 snd_seq_dummy,snd_seq_oss,snd_seq_midi,snd_seq_midi_event
snd_timer              17800  2 snd_pcm,snd_seq
snd_seq_device          6380  5 snd_seq_dummy,snd_seq_oss,snd_seq_midi,snd_rawmidi,snd_seq
snd                    45604  9 snd_hda_intel,snd_pcm_oss,snd_mixer_oss,snd_pcm,snd_seq_oss,snd_rawmidi,snd_seq,snd_timer,snd_seq_device
pcspkr                  2432  0
soundcore               6368  1 snd
snd_page_alloc          7816  2 snd_hda_intel,snd_pcm
evdev                   8000  4
ext3                  105512  2
jbd                    39444  1 ext3
mbcache                 7108  1 ext3
dm_mirror              15104  0
dm_log                  8484  1 dm_mirror
dm_snapshot            14340  0
raid1                  18016  2
md_mod                 67068  3 raid1
dm_round_robin          2656  1
dm_emc                  4384  0
dm_multipath           14920  3 dm_round_robin,dm_emc
dm_mod                 46184  18 dm_mirror,dm_log,dm_snapshot,dm_multipath
ide_cd_mod             27652  0
cdrom  30176  1 ide_cd_mo