I'm seeing the following bizarre behavior...

  - "rsync -av" from one local fs to another local fs (local disk on both)
  - rsync gets to very near completion then hangs indefinitely
  - attach strace to rsync process doing the 'copy from' and
    it starts up again and finishes!

This is on a Linux system with kernel 2.4.5-ac9.  I haven't had a chance
to try another kernel yet... I will try to do so tonight.  It does sound
like a kernel select/poll bug, I suppose, but I thought I'd check the
list to see if anyone has any ideas.

Here's the output of the strace... the rsync has been hung for 10 minutes.
First I'm attaching an strace to the 'copy to' process, which is just 
sitting in select, then I ^C out of that and attach to the 'copy from'
process and it starts right up:

[root@nova /u]# strace -p 16354
select(7, NULL, [6], NULL, {26, 150000} <unfinished ...>
^C

[root@nova /u]# strace -p 15409
select(2, NULL, [1], NULL, {53, 610000}) = 1 (out [1], left {53, 580000})
write(1, "%\0\0\tusr/X11R6/lib/X11/locale/iso"..., 41) = 41
select(6, [3 5], NULL, NULL, {60, 0})   = 1 (in [5], left {60, 0})
select(6, [5], NULL, NULL, {60, 0})     = 1 (in [5], left {60, 0})
read(5, "$\0\0\t", 4)                   = 4
select(6, [5], NULL, NULL, {60, 0})     = 1 (in [5], left {60, 0})
read(5, "usr/X11R6/lib/X11/locale/iso8859"..., 36) = 36
select(2, NULL, [1], NULL, {60, 0})     = 1 (out [1], left {60, 0})
write(1, "$\0\0\tusr/X11R6/lib/X11/locale/iso"..., 40) = 40
select(6, [3 5], NULL, NULL, {60, 0})   = 1 (in [5], left {60, 0})

  ...more of the same...  following are the last few lines before 
     process exits...

select(6, [5], [1], NULL, {60, 0})      = 2 (in [5], out [1], left {60, 0})
select(6, [5], NULL, NULL, {60, 0})     = 1 (in [5], left {60, 0})
read(5, "\24\0\0\t", 4)                 = 4
select(6, [5], NULL, NULL, {60, 0})     = 1 (in [5], left {60, 0})
read(5, "var/cache/man/cat7/\n", 20)    = 20
select(2, NULL, [1], NULL, {60, 0})     = 1 (out [1], left {60, 0})
write(1, "\24\0\0\tvar/cache/man/cat7/\n", 24) = 24
write(1, "\4\0\0\7\377\377\377\377", 8 <unfinished ...>
--- SIGHUP (Hangup) ---
--- SIGCONT (Continued) ---
<... write resumed> )                   = -1 ENOSYS (Function not implemented)
rt_sigaction(SIGUSR2, {SIG_IGN}, {0x8050590, [USR2], SA_RESTART|0x4000000}, 8) = 0
getpid()                                = 15409
kill(16354, SIGUSR1)                    = 0
_exit(20)                               = ?

Any thoughts?

--jurgen






Reply via email to