2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Justin Piszcz

# ps auxww | grep D
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]

After several days/weeks, this is the second time this has happened: while 
doing regular file I/O (decompressing a file), everything on the device 
went into D-state.


# mdadm -D /dev/md3
/dev/md3:
        Version : 00.90.03
  Creation Time : Wed Aug 22 10:38:53 2007
     Raid Level : raid5
     Array Size : 1318680576 (1257.59 GiB 1350.33 GB)
  Used Dev Size : 146520064 (139.73 GiB 150.04 GB)
   Raid Devices : 10
  Total Devices : 10
Preferred Minor : 3
    Persistence : Superblock is persistent

    Update Time : Sun Nov  4 06:38:29 2007
          State : active
 Active Devices : 10
Working Devices : 10
 Failed Devices : 0
  Spare Devices : 0

         Layout : left-symmetric
     Chunk Size : 1024K

           UUID : e37a12d1:1b0b989a:083fb634:68e9eb49
         Events : 0.4309

    Number   Major   Minor   RaidDevice State
       0       8       33        0      active sync   /dev/sdc1
       1       8       49        1      active sync   /dev/sdd1
       2       8       65        2      active sync   /dev/sde1
       3       8       81        3      active sync   /dev/sdf1
       4       8       97        4      active sync   /dev/sdg1
       5       8      113        5      active sync   /dev/sdh1
       6       8      129        6      active sync   /dev/sdi1
       7       8      145        7      active sync   /dev/sdj1
       8       8      161        8      active sync   /dev/sdk1
       9       8      177        9      active sync   /dev/sdl1

If I wanted to find out what is causing this, what type of debugging would 
I have to enable to track it down?  Any attempt to read/write files on the 
devices fails (also going into d-state).  Is there any useful information 
I can get currently before rebooting the machine?


# pwd
/sys/block/md3/md
# ls
array_state      dev-sdj1/         rd2@              stripe_cache_active
bitmap_set_bits  dev-sdk1/         rd3@              stripe_cache_size
chunk_size       dev-sdl1/         rd4@              suspend_hi
component_size   layout            rd5@              suspend_lo
dev-sdc1/        level             rd6@              sync_action
dev-sdd1/        metadata_version  rd7@              sync_completed
dev-sde1/        mismatch_cnt      rd8@              sync_speed
dev-sdf1/        new_dev           rd9@              sync_speed_max
dev-sdg1/        raid_disks        reshape_position  sync_speed_min
dev-sdh1/        rd0@              resync_start
dev-sdi1/        rd1@              safe_mode_delay
# cat array_state
active-idle
# cat mismatch_cnt
0
# cat stripe_cache_active
1
# cat stripe_cache_size
16384
# cat sync_action
idle
# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
md1 : active raid1 sdb2[1] sda2[0]
  136448 blocks [2/2] [UU]

md2 : active raid1 sdb3[1] sda3[0]
  129596288 blocks [2/2] [UU]

md3 : active raid5 sdl1[9] sdk1[8] sdj1[7] sdi1[6] sdh1[5] sdg1[4] sdf1[3] sde1[2] sdd1[1] sdc1[0]
      1318680576 blocks level 5, 1024k chunk, algorithm 2 [10/10] [UUUUUUUUUU]

md0 : active raid1 sdb1[1] sda1[0]
      16787776 blocks [2/2] [UU]

unused devices: <none>
#

Justin.


Re: 2.6.23.1: mdadm/raid5 hung/d-state (md3_raid5 stuck in endless loop?)

2007-11-04 Thread Justin Piszcz

Time to reboot, before reboot:

top - 07:30:23 up 13 days, 13:33, 10 users,  load average: 16.00, 15.99, 14.96
Tasks: 221 total,   7 running, 209 sleeping,   0 stopped,   5 zombie
Cpu(s):  0.0%us, 25.5%sy,  0.0%ni, 74.5%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:   8039432k total,  1744356k used,  6295076k free,  164k buffers
Swap: 16787768k total,  160k used, 16787608k free,   616960k cached

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  688 root      15  -5     0    0    0 R  100  0.0 121:21.43 md3_raid5
  273 root      20   0     0    0    0 D    0  0.0  14:40.68 pdflush
  274 root      20   0     0    0    0 D    0  0.0  13:00.93 pdflush

# cat /proc/fs/xfs/stat
extent_alloc 301974 256068291 310513 240764389
abt 1900173 15346352 738568 731314
blk_map 276979807 235589732 864002 211245834 591619 513439614 0
bmbt 50717 367726 14177 11846
dir 3818065 361561 359723 975628
trans 48452 2648064 570998
ig 6034530 2074424 43153 3960106 0 3869384 460831
log 282781 10454333 3028 399803 173488
push_ail 3267594 0 1620 2611 730365 0 4476 0 10269 0
xstrat 291940 0
rw 61423078 103732605
attr 0 0 0 0
icluster 312958 97323 419837
vnodes 90721 4019823 0 1926744 3929102 3929102 3929102 0
buf 14678900 11027087 3651843 25743 760449 0 0 15775888 280425
xpc 966925905920 1047628533165 1162276949815
debug 0

# cat /proc/meminfo
MemTotal:  8039432 kB
MemFree:   6287000 kB
Buffers:   164 kB
Cached: 617072 kB
SwapCached:  0 kB
Active: 178404 kB
Inactive:   589880 kB
SwapTotal:16787768 kB
SwapFree: 16787608 kB
Dirty:  494280 kB
Writeback:   86004 kB
AnonPages:  151240 kB
Mapped:  17092 kB
Slab:   259696 kB
SReclaimable:   170876 kB
SUnreclaim:  88820 kB
PageTables:  11448 kB
NFS_Unstable:0 kB
Bounce:  0 kB
CommitLimit:  20807484 kB
Committed_AS:   353536 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 15468 kB
VmallocChunk: 34359722699 kB

# echo 3 > /proc/sys/vm/drop_caches

# cat /proc/meminfo
MemTotal:  8039432 kB
MemFree:   6418352 kB
Buffers:32 kB
Cached: 597908 kB
SwapCached:  0 kB
Active: 172028 kB
Inactive:   579808 kB
SwapTotal:16787768 kB
SwapFree: 16787608 kB
Dirty:  494312 kB
Writeback:   86004 kB
AnonPages:  154104 kB
Mapped:  17416 kB
Slab:   144072 kB
SReclaimable:53100 kB
SUnreclaim:  90972 kB
PageTables:  11832 kB
NFS_Unstable:0 kB
Bounce:  0 kB
CommitLimit:  20807484 kB
Committed_AS:   360748 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 15468 kB
VmallocChunk: 34359722699 kB

Nothing is actually happening on the devices themselves, however:

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await  svctm  %util
sda               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdb               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdc               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdd               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sde               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdf               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdg               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdh               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdi               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdj               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdk               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
sdl               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
md0               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
md3               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
md2               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00
md1               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00   0.00   0.00


# vmstat 1
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 6  0    160 6420244     32 600092    0    0   221   22751  1  1 98  0
 6  0    160 6420228     32 600120    0    0     0     0 1015  142  0 25 75  0
 6  0    160 6420228     32 600120    0    0     0     0 1005  127  0 25 75  0
 6  0    160 6420228     32 600120    0    0     0    41 1022  151  0 26 74  0
 6  0160 6420228   

Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Michael Tokarev
Justin Piszcz wrote:
 # ps auxww | grep D
 USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
 root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
 root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]
 
 After several days/weeks, this is the second time this has happened,
 while doing regular file I/O (decompressing a file), everything on the
 device went into D-state.

The next time you come across something like that, do a SysRq-T dump and
post that.  It shows a stack trace of all processes - and in particular,
where exactly each task is stuck.
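
A minimal sketch of triggering that from a shell rather than the console
keyboard (assuming CONFIG_MAGIC_SYSRQ is enabled in this kernel):

# make sure the sysrq interface is enabled
echo 1 > /proc/sys/kernel/sysrq
# equivalent of Alt-SysRq-T: dump the state and stack of every task to the kernel log
echo t > /proc/sysrq-trigger
# the traces end up in the kernel ring buffer / syslog
dmesg | tail -n 300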

/mjt


Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread BERTRAND Joël

Justin Piszcz wrote:

# ps auxww | grep D
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]

After several days/weeks, this is the second time this has happened, 
while doing regular file I/O (decompressing a file), everything on the 
device went into D-state.


	Same observation here (kernel 2.6.23). I can see this bug when I try to 
synchronize a raid1 volume over iSCSI (each element is a raid5 volume), 
or sometimes only with a 1.5 TB raid5 volume. When this bug occurs, the md 
subsystem eats 100% of one CPU and pdflush remains in D state too. What 
is your architecture? I use two 32-thread T1000s (sparc64), and I'm 
trying to determine if this bug is arch-specific.


Regards,

JKB


Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Justin Piszcz



On Sun, 4 Nov 2007, BERTRAND Joël wrote:


Justin Piszcz wrote:

# ps auxww | grep D
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]

After several days/weeks, this is the second time this has happened, while 
doing regular file I/O (decompressing a file), everything on the device 
went into D-state.


	Same observation here (kernel 2.6.23). I can see this bug when I try 
to synchronize a raid1 volume over iSCSI (each element is a raid5 volume), or 
sometimes only with a 1.5 TB raid5 volume. When this bug occurs, the md subsystem 
eats 100% of one CPU and pdflush remains in D state too. What is your 
architecture? I use two 32-thread T1000s (sparc64), and I'm trying to 
determine if this bug is arch-specific.


Regards,

JKB



Using x86_64 here (Q6600/Intel DG965WH).

Justin.

Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Michael Tokarev
Justin Piszcz wrote:
 On Sun, 4 Nov 2007, Michael Tokarev wrote:
[]
 The next time you come across something like that, do a SysRq-T dump and
 post that.  It shows a stack trace of all processes - and in particular,
 where exactly each task is stuck.

 Yes I got it before I rebooted, ran that and then dmesg > file.
 
 Here it is:
 
 [1172609.665902]  80747dc0 80747dc0 80747dc0 
 80744d80
 [1172609.668768]  80747dc0 81015c3aa918 810091c899b4 
 810091c899a8

That's only a partial list.  All the kernel threads - which are most important
in this context - aren't shown.  You ran out of dmesg buffer, and the most
interesting entries were at the beginning.  If your /var/log partition is
working, the stuff should be in /var/log/kern.log or equivalent.  If it's
not working, there is a way to capture the info still, by stopping syslogd,
cat'ing /proc/kmsg to some tmpfs file and scp'ing it elsewhere.
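
Roughly along these lines (a sketch only - the syslog init script name and
the tmpfs mount point differ per distro, and user@otherhost is just a
placeholder):

# stop the regular syslog daemon so it no longer drains /proc/kmsg
/etc/init.d/sysklogd stop
# keep the capture off the hung array by writing to a tmpfs mount
mount -t tmpfs tmpfs /mnt/tmp
# read kernel messages into a file; interrupt once the SysRq-T output has been captured
cat /proc/kmsg > /mnt/tmp/kmsg.txt
# copy it off the box
scp /mnt/tmp/kmsg.txt user@otherhost: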

/mjt


telling mdadm to use spare drive.

2007-11-04 Thread Janek Kozicki
Hi,

I finished copying all data from old disc hdc to my shiny new
RAID5 array (/dev/hda3 /dev/sda3 missing). Next step is to create a
partition on hdc and add it to the array. And so I did this:

# mdadm --add /dev/md1 /dev/hdc3

But then I had a problem - /dev/hdc3 was added as a spare, and it didn't
resync automatically:

# mdadm -D /dev/md1
[]
    Number   Major   Minor   RaidDevice State
       0       3        3        0      active sync   /dev/hda3
       1       8        3        1      active sync   /dev/sda3
       2       0        0        2      removed

       3      22        3        -      spare   /dev/hdc3


I wanted to tell mdadm to use the spare device, and I wasn't sure how
to do this, so I tried the following:

# mdadm --stop /dev/md1
# mdadm --assemble --update=resync /dev/md1 /dev/hda3 /dev/sda3 /dev/hdc3

Now, 'mdadm -D /dev/md1' says:
[...]
    Number   Major   Minor   RaidDevice State
       0       3        3        0      active sync   /dev/hda3
       1       8        3        1      active sync   /dev/sda3
       3      22        3        2      spare rebuilding   /dev/hdc3


I'm writing here just because I want to be sure that I added this new
device correctly - I don't want to make any stupid mistake here...

# cat /proc/mdstat

md1 : active raid5 hda3[0] hdc3[3] sda3[1]
      966807296 blocks super 1.1 level 5, 128k chunk, algorithm 2 [3/2] [UU_]
      [=>...................]  recovery =  6.2% (30068096/483403648) finish=254.9min speed=29639K/sec
      bitmap: 8/8 pages [32KB], 32768KB chunk

Was there a better way to do this?  Is it OK?

-- 
Janek Kozicki |


Re: switching root fs '/' to boot from RAID1 with grub

2007-11-04 Thread H. Peter Anvin

Bill Davidsen wrote:


I don't understand your point; unless there's a Linux bootloader in the 
BIOS, it will boot whatever 512 bytes are in sector 0. So if that's crap, 
it doesn't matter what it would do if it was valid - some other bytes 
came off the drive instead. Maybe Windows, since there seems to be an 
option in Windows to check the boot sector on boot and rewrite it if it 
isn't the WinXP one.  One of my offspring has that problem: dual-boot 
system, and every time he boots Windows he has to boot from rescue and 
reinstall grub.


I think he could install grub in the partition, make that the active 
partition, and the boot would work, but he tried it and only partitions of 
type FAT or VFAT seem to boot, active or not.




The Grub-promoted practice of stuffing the Linux bootloader in the MBR 
is a bad idea, but that's not the issue here.


The issue here is that the bootloader itself is capable of making the 
decision to reject a corrupt image and boot the next device.  The Linux 
kernel, unfortunately, doesn't have a sane way to do that.


-hpa


Double mdadm --monitor == hang

2007-11-04 Thread Robin Lee Powell

Please cc me on replies; I'm not on the list.

I don't know if this is fixable, or even if it *should* be fixed, but
here's the summary: shortly after the machine reaches the state of two
independent mdadm --monitor instances plus a resync in progress, it
totally locks up.  Details follow.

I boot my machine from a USB stick, because the hard drive is
encrypted.  The start up goes something like this:

mdadm --assemble /dev/md_d0 /dev/sda /dev/sdb /dev/sdc /dev/sdd

losetup ... /dev/loop0 /dev/md_d0p2

/bin/mount -o rw -n /dev/loop0 /mnt

cd /mnt

/sbin/pivot_root . usb_boot

exec chroot . /sbin/init $* </dev/console >/dev/console 2>&1

Before it does all that, though, it starts up mdadm --monitor,
because it's got the Debian mdadm package installed, and that's how
that works.

Then it re-runs init after the pivot_root.  The OS on /dev/md_d0p2
also has the Debian mdadm package installed, so it also starts mdadm
--monitor.  The two instances have no real way to see each other.

This works fine until a resync is needed; shortly after the machine is
in the state of two mdadm --monitor instances plus a resync in progress,
it totally locks up.

Solved by not starting mdadm --monitor on the USB stick, obviously,
but it took quite a while to figure out.  :(
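
One way to keep it from starting on the USB stick (a sketch, assuming the
Debian packaging of this era honours START_DAEMON in /etc/default/mdadm -
check your package's version of that file for the exact knobs):

# on the USB stick's root filesystem, tell the init script not to start the monitor daemon
sed -i 's/^START_DAEMON=.*/START_DAEMON=false/' /etc/default/mdadm
# or remove the init script links there entirely
update-rc.d -f mdadm remove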

-Robin

-- 
Lojban Reason #17: http://en.wikipedia.org/wiki/Buffalo_buffalo
Proud Supporter of the Singularity Institute - http://singinst.org/
http://www.digitalkingdom.org/~rlpowell/ *** http://www.lojban.org/


Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread David Greaves
Michael Tokarev wrote:
 Justin Piszcz wrote:
 On Sun, 4 Nov 2007, Michael Tokarev wrote:
 []
 The next time you come across something like that, do a SysRq-T dump and
 post that.  It shows a stack trace of all processes - and in particular,
 where exactly each task is stuck.
 
 Yes I got it before I rebooted, ran that and then dmesg > file.

 Here it is:

 [1172609.665902]  80747dc0 80747dc0 80747dc0 
 80744d80
 [1172609.668768]  80747dc0 81015c3aa918 810091c899b4 
 810091c899a8
 
 That's only a partial list.  All the kernel threads - which are most important
 in this context - aren't shown.  You ran out of dmesg buffer, and the most
 interesting entries were at the beginning.  If your /var/log partition is
 working, the stuff should be in /var/log/kern.log or equivalent.  If it's
 not working, there is a way to capture the info still, by stopping syslogd,
 cat'ing /proc/kmsg to some tmpfs file and scp'ing it elsewhere.

or netconsole is actually pretty easy and incredibly useful in this kind of
situation even if there's no disk at all :)
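
For example (a sketch only - the ports, addresses, interface and MAC below
are placeholders; see Documentation/networking/netconsole.txt for the exact
parameter format):

# on the hanging box: stream kernel messages over UDP to a log host
modprobe netconsole netconsole=6665@192.168.0.10/eth0,6666@192.168.0.20/00:11:22:33:44:55
# on the log host: catch them with netcat (flag syntax varies between netcat flavours)
nc -u -l -p 6666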

David



Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Neil Brown
On Sunday November 4, [EMAIL PROTECTED] wrote:
 # ps auxww | grep D
 USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
 root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
 root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]
 
 After several days/weeks, this is the second time this has happened, while 
 doing regular file I/O (decompressing a file), everything on the device 
 went into D-state.

At a guess (I haven't looked closely) I'd say it is the bug that was
meant to be fixed by

commit 4ae3f847e49e3787eca91bced31f8fd328d50496

except that patch applied badly and needed to be fixed with
the following patch (not in git yet).
These have been sent to stable@ and should be in the queue for 2.6.23.2


NeilBrown

Fix misapplied patch in raid5.c

commit 4ae3f847e49e3787eca91bced31f8fd328d50496 did not get applied
correctly, presumably due to substantial similarities between
handle_stripe5 and handle_stripe6.

This patch (with lots of context) moves the chunk of new code from
handle_stripe6 (where it isn't needed (yet)) to handle_stripe5.


Signed-off-by: Neil Brown [EMAIL PROTECTED]

### Diffstat output
 ./drivers/md/raid5.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
--- .prev/drivers/md/raid5.c	2007-11-02 12:10:49.000000000 +1100
+++ ./drivers/md/raid5.c	2007-11-02 12:25:31.000000000 +1100
@@ -2607,40 +2607,47 @@ static void handle_stripe5(struct stripe
 	struct bio *return_bi = NULL;
 	struct stripe_head_state s;
 	struct r5dev *dev;
 	unsigned long pending = 0;
 
 	memset(&s, 0, sizeof(s));
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
 		"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
 		atomic_read(&sh->count), sh->pd_idx,
 		sh->ops.pending, sh->ops.ack, sh->ops.complete);
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
+	/* clean-up completed biofill operations */
+	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
+		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+	}
+
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		struct r5dev *dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx toread %p read %p write %p "
 			"written %p\n", i, dev->flags, dev->toread, dev->read,
 			dev->towrite, dev->written);
 
 		/* maybe we can request a biofill operation
 		 *
 		 * new wantfill requests are only permitted while
 		 * STRIPE_OP_BIOFILL is clear
 		 */
 		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
 			!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
 			set_bit(R5_Wantfill, &dev->flags);
 
 		/* now count some things */
@@ -2880,47 +2887,40 @@ static void handle_stripe6(struct stripe
 	struct stripe_head_state s;
 	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 
 	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n",
 	       (unsigned long long)sh->sector, sh->state,
 	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
 	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
 	clear_bit(STRIPE_HANDLE, &sh->state);
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
 	s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
-	/* clean-up completed biofill operations */
-	if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
-		clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
-	}
-
 	rcu_read_lock();
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
 		dev = &sh->dev[i];
 		clear_bit(R5_Insync, &dev->flags);
 
 		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
 			i, dev->flags, dev->toread, dev->towrite, dev->written);
 		/* maybe we can reply to a read */

Re: 2.6.23.1: mdadm/raid5 hung/d-state

2007-11-04 Thread Justin Piszcz



On Mon, 5 Nov 2007, Neil Brown wrote:


On Sunday November 4, [EMAIL PROTECTED] wrote:

# ps auxww | grep D
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       273  0.0  0.0      0     0 ?        D    Oct21  14:40 [pdflush]
root       274  0.0  0.0      0     0 ?        D    Oct21  13:00 [pdflush]

After several days/weeks, this is the second time this has happened, while
doing regular file I/O (decompressing a file), everything on the device
went into D-state.


At a guess (I haven't looked closely) I'd say it is the bug that was
meant to be fixed by

commit 4ae3f847e49e3787eca91bced31f8fd328d50496

except that patch applied badly and needed to be fixed with
the following patch (not in git yet).
These have been sent to stable@ and should be in the queue for 2.6.23.2



Ah, thanks Neil - I'll update as soon as it is released.

Justin.
