Hi Anand,

`ulimit -l`, run as root, reports 64.


This dmesg output is from the second system.

I don't see anything new on the first system other than what was there when the system booted. Do you want to see the whole dmesg output? Where should I post it? There are 1600 lines.

...
ling

INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000000     0  8880      1 0x00000080
 ffff880614b75e48 0000000000000086 0000000000000000 ffff88010ed65d80
 000000000000038b 000000000000038b ffff880614b75ee8 ffffffff814ef8f5
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff814ef8f5>] ? page_fault+0x25/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81141768>] sys_munmap+0x48/0x80
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000000     0  8880      1 0x00000080
 ffff880614b75e48 0000000000000086 0000000000000000 ffff88010ed65d80
 000000000000038b 000000000000038b ffff880614b75ee8 ffffffff814ef8f5
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff814ef8f5>] ? page_fault+0x25/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81141768>] sys_munmap+0x48/0x80
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000009     0  8880      1 0x00000080
 ffff880614b75e08 0000000000000086 0000000000000000 ffff88062d638338
 ffff880c30ef88c0 ffffffff8120d34f ffff880614b75d98 ffff88061406f740
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff8120d34f>] ? security_inode_permission+0x1f/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81131ddc>] sys_mmap_pgoff+0x5c/0x2d0
 [<ffffffff81010469>] sys_mmap+0x29/0x30
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000009     0  8880      1 0x00000080
 ffff880614b75e08 0000000000000086 0000000000000000 ffff88062d638338
 ffff880c30ef88c0 ffffffff8120d34f ffff880614b75d98 ffff88061406f740
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff8120d34f>] ? security_inode_permission+0x1f/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81131ddc>] sys_mmap_pgoff+0x5c/0x2d0
 [<ffffffff81010469>] sys_mmap+0x29/0x30
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000003     0  8880      1 0x00000080
 ffff880614b75e08 0000000000000086 0000000000000000 ffff880630ab1ab8
 ffff880c30ef88c0 ffffffff8120d34f ffff880614b75d98 ffff88062df10480
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff8120d34f>] ? security_inode_permission+0x1f/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81131ddc>] sys_mmap_pgoff+0x5c/0x2d0
 [<ffffffff81010469>] sys_mmap+0x29/0x30
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfsd:9471 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfsd    D 0000000000000004     0  9471      1 0x00000080
 ffff8801077c3740 0000000000000082 0000000000000000 ffff8801077c36b8
 ffffffff8127f138 0000000000000000 0000000000000000 ffff8801077c36d8
 ffff8806146f4638 ffff8801077c3fd8 000000000000f4e8 ffff8806146f4638
Call Trace:
 [<ffffffff8127f138>] ? swiotlb_dma_mapping_error+0x18/0x30
 [<ffffffff8127f138>] ? swiotlb_dma_mapping_error+0x18/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffffa019607a>] ? ixgbe_xmit_frame_ring+0x93a/0xfc0 [ixgbe]
 [<ffffffff814ef1f6>] rwsem_down_read_failed+0x26/0x30
 [<ffffffff81276e84>] call_rwsem_down_read_failed+0x14/0x30
 [<ffffffff814ee6f4>] ? down_read+0x24/0x30
 [<ffffffff81042bc7>] __do_page_fault+0x187/0x480
 [<ffffffff81430c38>] ? dev_queue_xmit+0x178/0x6b0
 [<ffffffff8146809c>] ? ip_finish_output+0x13c/0x310
 [<ffffffff814f253e>] do_page_fault+0x3e/0xa0
 [<ffffffff814ef8f5>] page_fault+0x25/0x30
 [<ffffffff81275a6d>] ? copy_user_generic_string+0x2d/0x40
 [<ffffffff81425655>] ? memcpy_toiovec+0x55/0x80
 [<ffffffff81426070>] skb_copy_datagram_iovec+0x60/0x2c0
 [<ffffffff8141ceac>] ? lock_sock_nested+0xac/0xc0
 [<ffffffff814ef5cb>] ? _spin_unlock_bh+0x1b/0x20
 [<ffffffff814722d5>] tcp_recvmsg+0xca5/0xe90
 [<ffffffff814925ea>] inet_recvmsg+0x5a/0x90
 [<ffffffff8141bff1>] sock_aio_read+0x181/0x190
 [<ffffffff810566a3>] ? perf_event_task_sched_out+0x33/0x80
 [<ffffffff8100988e>] ? __switch_to+0x26e/0x320
 [<ffffffff8141be70>] ? sock_aio_read+0x0/0x190
 [<ffffffff8117614b>] do_sync_readv_writev+0xfb/0x140
 [<ffffffff81090a90>] ? autoremove_wake_function+0x0/0x40
 [<ffffffff8120c1e6>] ? security_file_permission+0x16/0x20
 [<ffffffff811771df>] do_readv_writev+0xcf/0x1f0
 [<ffffffff811b9b50>] ? sys_epoll_wait+0xa0/0x300
 [<ffffffff814ecb0e>] ? thread_return+0x4e/0x760
 [<ffffffff81177513>] vfs_readv+0x43/0x60
 [<ffffffff81177641>] sys_readv+0x51/0xb0
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfsd:9545 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfsd    D 0000000000000006     0  9545      1 0x00000080
 ffff880c24a7bcf8 0000000000000082 0000000000000000 ffffffff8107c0a0
 ffff88066a0a7580 ffff880c30460000 0000000000000000 0000000000000000
 ffff88066a0a7b38 ffff880c24a7bfd8 000000000000f4e8 ffff88066a0a7b38
Call Trace:
 [<ffffffff8107c0a0>] ? process_timeout+0x0/0x10
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff8127f18c>] ? is_swiotlb_buffer+0x3c/0x50
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffffa0211b96>] ib_umem_release+0x76/0x110 [ib_core]
 [<ffffffffa0230d52>] mlx4_ib_dereg_mr+0x32/0x50 [mlx4_ib]
 [<ffffffffa020cd85>] ib_dereg_mr+0x35/0x50 [ib_core]
 [<ffffffffa041bc5b>] ib_uverbs_dereg_mr+0x7b/0xf0 [ib_uverbs]
 [<ffffffffa04194ef>] ib_uverbs_write+0xbf/0xe0 [ib_uverbs]
 [<ffffffff8117646d>] ? rw_verify_area+0x5d/0xc0
 [<ffffffff81176588>] vfs_write+0xb8/0x1a0
 [<ffffffff810d4692>] ? audit_syscall_entry+0x272/0x2a0
 [<ffffffff81176f91>] sys_write+0x51/0x90
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfsd:9546 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfsd    D 0000000000000004     0  9546      1 0x00000080
 ffff880c0634bcf0 0000000000000082 ffff880c0634bcb8 ffff880c0634bcb4
 0000000000015f80 ffff88063fc24b00 ffff880655495f80 0000000000000400
 ffff880c2dccc5f8 ffff880c0634bfd8 000000000000f4e8 ffff880c2dccc5f8
Call Trace:
 [<ffffffff810566a3>] ? perf_event_task_sched_out+0x33/0x80
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff810097cc>] ? __switch_to+0x1ac/0x320
 [<ffffffff814ef1f6>] rwsem_down_read_failed+0x26/0x30
 [<ffffffff814ecb0e>] ? thread_return+0x4e/0x760
 [<ffffffff81276e84>] call_rwsem_down_read_failed+0x14/0x30
 [<ffffffff814ee6f4>] ? down_read+0x24/0x30
 [<ffffffff81042bc7>] __do_page_fault+0x187/0x480
 [<ffffffffa0419e16>] ? ib_uverbs_event_read+0x1d6/0x240 [ib_uverbs]
 [<ffffffff814f253e>] do_page_fault+0x3e/0xa0
 [<ffffffff814ef8f5>] page_fault+0x25/0x30
INFO: task glusterfsd:9553 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfsd    D 000000000000000e     0  9553      1 0x00000080
 ffff8806e131dd98 0000000000000082 0000000000000000 ffff8806e131dd64
 ffff8806e131dd48 ffffffffa026dfb6 ffff8806e131dd28 ffffffff00000000
 ffff880c2f41c678 ffff8806e131dfd8 000000000000f4e8 ffff880c2f41c678
Call Trace:
 [<ffffffffa026dfb6>] ? xfs_attr_get+0xb6/0xc0 [xfs]
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81136009>] sys_madvise+0x329/0x760
 [<ffffffff81195740>] ? mntput_no_expire+0x30/0x110
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b
INFO: task glusterfs:8880 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
glusterfs     D 0000000000000003     0  8880      1 0x00000080
 ffff880614b75e08 0000000000000086 0000000000000000 ffff880630ab1ab8
 ffff880c30ef88c0 ffffffff8120d34f ffff880614b75d98 ffff88062df10480
 ffff88062bc4ba78 ffff880614b75fd8 000000000000f4e8 ffff88062bc4ba78
Call Trace:
 [<ffffffff8120d34f>] ? security_inode_permission+0x1f/0x30
 [<ffffffff814ef065>] rwsem_down_failed_common+0x95/0x1d0
 [<ffffffff814ef1c3>] rwsem_down_write_failed+0x23/0x30
 [<ffffffff81276eb3>] call_rwsem_down_write_failed+0x13/0x20
 [<ffffffff814ee6c2>] ? down_write+0x32/0x40
 [<ffffffff81131ddc>] sys_mmap_pgoff+0x5c/0x2d0
 [<ffffffff81010469>] sys_mmap+0x29/0x30
 [<ffffffff8100b0f2>] system_call_fastpath+0x16/0x1b


On 06/08/2012 05:18 PM, Anand Avati wrote:
Those are 4.x GB. Can you post dmesg output as well? Also, what's 'ulimit -l' on your system?

On Fri, Jun 8, 2012 at 4:41 PM, Ling Ho <[email protected] <mailto:[email protected]>> wrote:


    This is the core file from the crash just now

    [root@psanaoss213 /]# ls -al core*
    -rw------- 1 root root 4073594880 Jun  8 15:05 core.22682

    From yesterday:
    [root@psanaoss214 /]# ls -al core*
    -rw------- 1 root root 4362727424 Jun  8 00:58 core.13483
    -rw------- 1 root root 4624773120 Jun  8 03:21 core.8792



    On 06/08/2012 04:34 PM, Anand Avati wrote:
    Is it possible the system was running low on memory? I see you
    have 48GB, but memory registration failure typically would be
    because the system limit on the number of pinnable pages in RAM
    was hit. Can you tell us the size of your core dump files after
    the crash?

    Avati

    On Fri, Jun 8, 2012 at 4:22 PM, Ling Ho <[email protected]
    <mailto:[email protected]>> wrote:

        Hello,

        I have a brick that crashed twice today, and another
        different brick that crashed just a while ago.

        This is what I see in one of the brick logs:

        patchset: git://git.gluster.com/glusterfs.git
        <http://git.gluster.com/glusterfs.git>
        patchset: git://git.gluster.com/glusterfs.git
        <http://git.gluster.com/glusterfs.git>
        signal received: 6
        signal received: 6
        time of crash: 2012-06-08 15:05:11
        configuration details:
        argp 1
        backtrace 1
        dlfcn 1
        fdatasync 1
        libpthread 1
        llistxattr 1
        setfsid 1
        spinlock 1
        epoll.h 1
        xattr.h 1
        st_atim.tv_nsec 1
        package-string: glusterfs 3.2.6
        /lib64/libc.so.6[0x34bc032900]
        /lib64/libc.so.6(gsignal+0x35)[0x34bc032885]
        /lib64/libc.so.6(abort+0x175)[0x34bc034065]
        /lib64/libc.so.6[0x34bc06f977]
        /lib64/libc.so.6[0x34bc075296]
        /opt/glusterfs/3.2.6/lib64/libglusterfs.so.0(__gf_free+0x44)[0x7f1740ba25e4]
        /opt/glusterfs/3.2.6/lib64/libgfrpc.so.0(rpc_transport_destroy+0x47)[0x7f1740956967]
        /opt/glusterfs/3.2.6/lib64/libgfrpc.so.0(rpc_transport_unref+0x62)[0x7f1740956a32]
        /opt/glusterfs/3.2.6/lib64/glusterfs/3.2.6/rpc-transport/rdma.so(+0xc135)[0x7f173ca27135]
        /lib64/libpthread.so.0[0x34bc8077f1]
        /lib64/libc.so.6(clone+0x6d)[0x34bc0e5ccd]
        ---------

        And somewhere before these, there is also
        [2012-06-08 15:05:07.512604] E [rdma.c:198:rdma_new_post]
        0-rpc-transport/rdma: memory registration failed

        I have 48GB of memory on the system:

        # free
                     total       used       free     shared    buffers     cached
        Mem:      49416716   34496648   14920068          0      31692   28209612
        -/+ buffers/cache:    6255344   43161372
        Swap:      4194296       1740    4192556

        # uname -a
        Linux psanaoss213 2.6.32-220.7.1.el6.x86_64 #1 SMP Fri Feb 10
        15:22:22 EST 2012 x86_64 x86_64 x86_64 GNU/Linux

        The server gluster version is 3.2.6-1. I have both
        rdma clients and tcp clients over a 10Gb/s network.

        Any suggestion what I should look for?

        Is there a way to just restart the brick, and not glusterd on
        the server? I have 8 bricks on the server.

        Thanks,
        ...
        ling


        Here's the volume info:

        # gluster volume info

        Volume Name: ana12
        Type: Distribute
        Status: Started
        Number of Bricks: 40
        Transport-type: tcp,rdma
        Bricks:
        Brick1: psanaoss214:/brick1
        Brick2: psanaoss214:/brick2
        Brick3: psanaoss214:/brick3
        Brick4: psanaoss214:/brick4
        Brick5: psanaoss214:/brick5
        Brick6: psanaoss214:/brick6
        Brick7: psanaoss214:/brick7
        Brick8: psanaoss214:/brick8
        Brick9: psanaoss211:/brick1
        Brick10: psanaoss211:/brick2
        Brick11: psanaoss211:/brick3
        Brick12: psanaoss211:/brick4
        Brick13: psanaoss211:/brick5
        Brick14: psanaoss211:/brick6
        Brick15: psanaoss211:/brick7
        Brick16: psanaoss211:/brick8
        Brick17: psanaoss212:/brick1
        Brick18: psanaoss212:/brick2
        Brick19: psanaoss212:/brick3
        Brick20: psanaoss212:/brick4
        Brick21: psanaoss212:/brick5
        Brick22: psanaoss212:/brick6
        Brick23: psanaoss212:/brick7
        Brick24: psanaoss212:/brick8
        Brick25: psanaoss213:/brick1
        Brick26: psanaoss213:/brick2
        Brick27: psanaoss213:/brick3
        Brick28: psanaoss213:/brick4
        Brick29: psanaoss213:/brick5
        Brick30: psanaoss213:/brick6
        Brick31: psanaoss213:/brick7
        Brick32: psanaoss213:/brick8
        Brick33: psanaoss215:/brick1
        Brick34: psanaoss215:/brick2
        Brick35: psanaoss215:/brick4
        Brick36: psanaoss215:/brick5
        Brick37: psanaoss215:/brick7
        Brick38: psanaoss215:/brick8
        Brick39: psanaoss215:/brick3
        Brick40: psanaoss215:/brick6
        Options Reconfigured:
        performance.io-thread-count: 16
        performance.write-behind-window-size: 16MB
        performance.cache-size: 1GB
        nfs.disable: on
        performance.cache-refresh-timeout: 1
        network.ping-timeout: 42
        performance.cache-max-file-size: 1PB

        _______________________________________________
        Gluster-users mailing list
        [email protected] <mailto:[email protected]>
        http://gluster.org/cgi-bin/mailman/listinfo/gluster-users





_______________________________________________
Gluster-users mailing list
[email protected]
http://gluster.org/cgi-bin/mailman/listinfo/gluster-users

Reply via email to