We've had some trouble with the AMI sometimes locking up.  During this time it won't  new connections, and doesn't send out sending out data.  Running asterisk -rx "manager show eventq" can grow to over 100K events, and many megabytes of content.  Sometimes the problem seems to resolve itself, though not all the time. Recently we had the AMI lock up for over an our and only restarting asterisk resolved the situation.  This seems to also affect incoming calls, which are can be blocked.

We did some gdb thread dumps on Asterisk, and found two problem threads.  One thread is stuck attempting to obtain a lock in manager.c purge_sessions, the other is holding a session lock in manager.c process_events.  I dumped the thread twice on the same AMI lockup, and both stuck threads were stuck in the same spot.

In addition, we found the sendq on the Asterisk side of the AMI connection to be very high.  After we killed this connection, the AMI recovered.  We've long thought this problem is related to the below Asterisk issue, and finding this is further evidence.

https://issues.asterisk.org/jira/browse/ASTERISK-28469



Code snippets:

One thread is stuck waiting to acquire a session lock to see if it should be purged:

static void purge_sessions(int n_max)
{
    struct ao2_container *sessions;
    struct mansession_session *session;
    time_t now = time(NULL);
    struct ao2_iterator i;

    sessions = ao2_global_obj_ref(mgr_sessions);
    if (!sessions) {
        return;
    }
    i = ao2_iterator_init(sessions, 0);
    ao2_ref(sessions, -1);
    while ((session = ao2_iterator_next(&i)) && n_max > 0) {
        ao2_lock(session);   *****STUCK HERE*****
        if (session->sessiontimeout && (now > session->sessiontimeout) && !session->inuse) {
            if (session->authenticated
                && VERBOSITY_ATLEAST(2)
                && manager_displayconnects(session)) {
                ast_verb(2, "HTTP Manager '%s' timed out from %s\n",
                    session->username, ast_sockaddr_stringify_addr(&session->addr));
            }
            ao2_unlock(session);
            session_destroy(session);
            n_max--;
        } else {
            ao2_unlock(session);
            unref_mansession(session);
        }
    }
    ao2_iterator_destroy(&i);
}



Thread holding the lock:

static int process_events(struct mansession *s)
{
    int ret = 0;

    ao2_lock(s->session);
    if (s->session->stream != NULL) {
        struct eventqent *eqe = s->session->last_ev;

        while ((eqe = advance_event(eqe))) {
            if (eqe->category == EVENT_FLAG_SHUTDOWN) {
                ast_debug(3, "Received CloseSession event\n");
                ret = -1;
            }
            if (!ret && s->session->authenticated &&
                (s->session->readperm & eqe->category) == eqe->category &&
                (s->session->send_events & eqe->category) == eqe->category) {
                    if (match_filter(s, eqe->eventdata)) {
                        if (send_string(s, eqe->eventdata) < 0)  ***** STUCK HERE *****
                            ret = -1;    /* don't send more */
                    }
            }
            s->session->last_ev = eqe;
        }
    }
    ao2_unlock(s->session);
    return ret;
}

If I trace back send_string via my thread dump,  it goes to:

int ast_wait_for_output(int fd, int ms)
{
    struct pollfd pfd[1];

    memset(pfd, 0, sizeof(pfd));
    pfd[0].fd = fd;
    pfd[0].events = POLLOUT;
    return ast_poll(pfd, 1, ms);   ****STUCK HERE****
}


Which ultimately calls a function in glibc, poll.c.  I haven't written C for in much capacity for 20 years, so this is largely where my ability to debug ends.

I don't know why this never times out.  I did notice that there's a timeout that's being set in send_string that uses writetimout. We leave that at the default of 100ms in manager.conf.


We're running Centos 6, with glibc-2.12-1.212.el6_10.3.x86_64, and Asterisk 16.8-cert5.



Here's the dump of the two threads (there's another dump as well I took a few minutes later that's very similar)

Thread 99 (Thread 0x7fbee6b87700 (LWP 24931)):
#0  __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
No locals.
#1  0x00000039c04095f3 in _L_lock_892 () from /lib64/libpthread.so.0
No symbol table info available.
#2  0x00000039c04094d7 in __pthread_mutex_lock (mutex=0x7fbf18012900) at pthread_mutex_lock.c:82
        ignore1 = 128
        ignore2 = 402729216
        ignore3 = -512
        type = 402729216
        id = 24931
#3  0x000000000045b7a6 in __ao2_lock (user_data=<value optimized out>, lock_how=<value optimized out>, file=0x66aa32 "manager.c", func=0x66f388 "purge_sessions", line=7047, var=<value optimized out>) at astobj2.c:241
        obj = <value optimized out>
        obj_mutex = <value optimized out>
        obj_rwlock = <value optimized out>
        res = 0
#4  0x00000000005f448a in purge_sessions (data=<value optimized out>) at manager.c:7047         i = {c = 0x1140d30, last_node = 0x7fbf18004058, complete = 0, flags = 0}
        n_max = 1
        sessions = <value optimized out>
        session = 0x7fbf18012940
        now = 1609358394
#5  purge_old_stuff (data=<value optimized out>) at manager.c:8640
No locals.
#6  0x000000000058cc41 in ast_tcptls_server_root (data=0x8d9c20) at tcptls.c:270
        i = <value optimized out>
        desc = 0x8d9c20
        fd = -1
        addr = {ss = {ss_family = 2, __ss_padding = "\200\320\330\021\000\374\000\000\000\000\000\000\000\000\260\b\000Ⱦ\177\000\000 \000\000\000\000\000\000\000 \341\070\300\071\000\000\000\355\247\a\300\071", '\000' <repeats 11 times>"\340, \370\024\001\000\000\000\000\021\220a\000\000\000\000\000\230\001\000\000\000\000\000\000p\325a", '\000' <repeats 13 times>, "\003\000\000\000\000\000\000\000v\254E", '\000' <repeats 12 times>, __ss_align = 0}, len = 16}
        tcptls_session = <value optimized out>
        launched = 140456899585792
        __PRETTY_FUNCTION__ = "ast_tcptls_server_root"
        __FUNCTION__ = "ast_tcptls_server_root"
#7  0x000000000059bc77 in dummy_start (data=<value optimized out>) at utils.c:1249         __clframe = {__cancel_routine = <value optimized out>, __cancel_arg = 0x7fbee6b87700, __do_it = 1, __cancel_type = 0}
        ret = <value optimized out>
        a = {start_routine = 0x58cbf0 <ast_tcptls_server_root>, data = 0x8d9c20, name = <value optimized out>}
        __PRETTY_FUNCTION__ = "dummy_start"
#8  0x00000039c0407aa1 in start_thread (arg=0x7fbee6b87700) at pthread_create.c:301
        __res = <value optimized out>
        pd = 0x7fbee6b87700
        now = <value optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140457891362560, -1282427221842092215, 140736079386656, 140457891363264, 0, 3, 1246980655494963017, -1278908234687602871}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, cleanup = 0x0, canceltype = 0}}}
        not_first_call = <value optimized out>
        pagesize_m1 = <value optimized out>
        sp = <value optimized out>
        freesize = <value optimized out>
#9  0x00000039c00e8c4d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
No locals.



Thread 36 (Thread 0x7fbee634b700 (LWP 20342)):
#0  0x00000039c00df403 in __poll (fds=<value optimized out>, nfds=<value optimized out>, timeout=<value optimized out>) at ../sysdeps/unix/sysv/linux/poll.c:87
        __arg2 = 1
        _a3 = 100
        _a1 = 140457882722704
        resultvar = <value optimized out>
        __arg3 = 100
        __arg1 = 140457882722704
        _a2 = 1
        resultvar = <value optimized out>
        oldtype = 0
        result = <value optimized out>
#1  0x000000000059cc27 in ast_wait_for_output (fd=<value optimized out>, ms=<value optimized out>) at utils.c:1357
        pfd = {{fd = 29, events = 4, revents = 0}}
#2  0x00000000004f2621 in ast_iostream_write (stream=0x7fbec8004598, buffer=0x7fbe94f22358, size=421) at iostream.c:487
        start = {tv_sec = 1609360234, tv_usec = 335812}
        ms = <value optimized out>
        res = <value optimized out>
        written = 0
        remaining = 421
        __PRETTY_FUNCTION__ = "ast_iostream_write"
#3  0x00000000005ecbfd in send_string (s=0x7fbee634acd0) at manager.c:3041
        stream = 0x7fbec8004598
        len = 421
        res = <value optimized out>
#4  process_events (s=0x7fbee634acd0) at manager.c:6241
        eqe = <value optimized out>
        ret = 0
        __PRETTY_FUNCTION__ = "process_events"
#5  0x0000000000602a6f in do_message (data=0x7fbec8002d60) at manager.c:6865
        m = {hdrcount = 0, headers = {0x0 <repeats 128 times>}}
        hdr_loss = 0
        header_buf = '\000' <repeats 1024 times>
        res = <value optimized out>
        now = 1609332593
#6  session_do (data=0x7fbec8002d60) at manager.c:7004
        ser = 0x7fbec8002d60
        session = 0x7fbf18012940
        s = {session = 0x7fbf18012940, stream = 0x7fbec8004598, tcptls_session = 0x7fbec8002d60, parsing = MESSAGE_OKAY, write_error = 1, hook = 0x0, lock = {mutex = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 1, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 16 times>, "\001", '\000' <repeats 22 times>, __align = 0}, _track = 0x0, _flags = {tracking = 0, setup = 0}}}
        res = -1
        arg = 1
        ser_remote_address_tmp = {ss = {ss_family = 2, __ss_padding = "\256\212\330\021\000\374\000\000\000\000\000\000\000\000[\000\000\000\003\000\000\000 \000\000\030\277\177\000\000`\242a\000\000\000\000\000\200<\002\000\000\000\000\000\200S\001\030\277\177\000\000\004\000\000\000\000\000\000\000 \341\070\300\071\000\000\000\355\247\a\300\071\000\000\000 \341\070\300\071\000\000\000\260Y\215\000\000\000\000\000`\242a\000\000\000\000\000\305\000\000\000\000\000\000\000\360\260e\000\000\000\000", __ss_align = 140457375968976}, len = 16}
        __PRETTY_FUNCTION__ = "session_do"
#7  0x000000000059bc77 in dummy_start (data=<value optimized out>) at utils.c:1249         __clframe = {__cancel_routine = <value optimized out>, __cancel_arg = 0x7fbee634b700, __do_it = 1, __cancel_type = 0}
        ret = <value optimized out>
        a = {start_routine = 0x58d2c0 <handle_tcptls_connection>, data = 0x7fbec8002d60, name = <value optimized out>}
        __PRETTY_FUNCTION__ = "dummy_start"
#8  0x00000039c0407aa1 in start_thread (arg=0x7fbee634b700) at pthread_create.c:301
        __res = <value optimized out>
        pd = 0x7fbee634b700
        now = <value optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140457882728192, -1282427221842092215, 140457891359648, 140457882728896, 140457375968976, 3, 1246981791513812809, -1278908234687602871}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, cleanup = 0x0, canceltype = 0}}}
        not_first_call = <value optimized out>
        pagesize_m1 = <value optimized out>
        sp = <value optimized out>
        freesize = <value optimized out>
#9  0x00000039c00e8c4d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
No locals.





--

-- 
_____________________________________________________________________
-- Bandwidth and Colocation Provided by http://www.api-digital.com --

asterisk-dev mailing list
To UNSUBSCRIBE or update options visit:
   http://lists.digium.com/mailman/listinfo/asterisk-dev

Reply via email to