On 03/13/2013 02:18 PM, McFarland, Jeffrey wrote: > Here are the values from another sort that has been running for over 12 hours > now. This time that second argument (number of threads) looks fine in all > three cases. And this time there are no zombie threads. > >> : pstack 20632 > 20632: /usr/local/abacus/etsort/sort -tn -S 295063 --batch-size=100 -T > /disk/ > ----------------- lwp# 1 / thread# 1 -------------------- > ffffffff7eadc810 lwp_wait (f2, ffffffff7fffea9c) > ffffffff7ead4d74 _thrp_join (f2, 0, 0, 1, ffffffff7fffeca0, > ffffffff7fffea9c) + 38 > 000000010000f2f4 sortlines (110137e90, 8, 7194a, 11015bfe0, > ffffffff7fffeca0, 100136240) + 174 > 0000000100010144 sort (100137cd0, 1, ffffffff7ffff660, 8, ffffffff7fffeeac, > ffffffff7ed00200) + 2f0 > 0000000100012bf4 main (13, ffffffff7ffff1f8, ffffffff7ffff298, 100136ca8, > 100000000, ffffffff7ed00200) + 21cc > 0000000100004ca4 _start (0, 0, 0, 0, 0, 0) + 7c > ----------------- lwp# 242 / thread# 242 -------------------- > ffffffff7eadc810 lwp_wait (f4, ffffffff7e1fbd2c) > ffffffff7ead4d74 _thrp_join (f4, 0, 0, 1, ffffffff7fffeca0, > ffffffff7e1fbd2c) + 38 > 000000010000f2f4 sortlines (110137e90, 4, 7194a, 11015c050, > ffffffff7fffeca0, 100136240) + 174 > 000000010000f168 sortlines_thread (ffffffff7fffeb60, 1fc000, 0, 0, > 10000f104, 0) + 64 > ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0)
> ----------------- lwp# 244 / thread# 244 -------------------- > ffffffff7ead8818 lwp_park (0, 0, 0) > 000000010000e710 lock_node (11015c360, 10f691fb0, ffffffff7ec4a300, > ffffffff7fffecac, ffffffff7ed00a00, 0) + 14 > 000000010000efbc queue_check_insert_parent (ffffffff7fffeca0, 11015c3d0, > 100136240, 1101597dd, ffffffff7ed00a00, 1c00) + 2c > 000000010000f0e8 merge_loop (ffffffff7fffeca0, 7194a, 100136240, 1101597dd, > ffffffff7eacff0c, 3) + 90 > 000000010000f43c sortlines (110137e90, 2, 7194a, 11015c0c0, > ffffffff7fffeca0, 100136240) + 2bc > 000000010000f168 sortlines_thread (ffffffff7e1fbdf0, 1fc000, 0, 0, > 10000f104, 0) + 64 > ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0) Looks like a deadlock, but may be triggered by stack corruption, as the failure modes vary. Would it be possible to annotate the lock_node() and unlock_node() calls with the attached patch? This should verify we're at least not missing an unlock() somewhere. You can then capture the annotations by adding '2> locks' at the end of the command. Thanks, Pádraig.
diff --git a/src/sort.c b/src/sort.c index 7410abc..9d156ac 100644 --- a/src/sort.c +++ b/src/sort.c @@ -3196,6 +3196,7 @@ merge_tree_init (size_t nthreads, size_t nlines, struct line *dest) root->parent = NULL; root->level = MERGE_END; root->queued = false; + fprintf (stderr, "merge_tree_init init_node %p\n", root); pthread_mutex_init (&root->lock, NULL); init_node (root, root + 1, dest, nthreads, nlines, false); @@ -3238,6 +3239,7 @@ init_node (struct merge_node *restrict parent, node->parent = parent; node->level = parent->level + 1; node->queued = false; + fprintf (stderr, "init_node init_node %p\n", node); pthread_mutex_init (&node->lock, NULL); if (nthreads > 1) @@ -3335,6 +3337,7 @@ queue_pop (struct merge_node_queue *queue) while (! (node = heap_remove_top (queue->priority_queue))) pthread_cond_wait (&queue->cond, &queue->mutex); pthread_mutex_unlock (&queue->mutex); + fprintf (stderr, "queue_pop lock_node %p\n", node); lock_node (node); node->queued = false; return node; @@ -3455,8 +3458,10 @@ queue_check_insert_parent (struct merge_node_queue *queue, { if (node->level > MERGE_ROOT) { + fprintf (stderr, "queue_check_insert_parent lock_node %p\n", node->parent); lock_node (node->parent); queue_check_insert (queue, node->parent); + fprintf (stderr, "queue_check_insert_parent unlock_node %p\n", node->parent); unlock_node (node->parent); } else if (node->nlo + node->nhi == 0) @@ -3483,6 +3488,7 @@ merge_loop (struct merge_node_queue *queue, if (node->level == MERGE_END) { + fprintf (stderr, "merge_loop_MERGE_END unlock_node %p\n", node); unlock_node (node); /* Reinsert so other threads can pop it. 
*/ queue_insert (queue, node); @@ -3492,6 +3498,7 @@ merge_loop (struct merge_node_queue *queue, queue_check_insert (queue, node); queue_check_insert_parent (queue, node); + fprintf (stderr, "merge_loop unlock_node %p\n", node); unlock_node (node); } } @@ -3608,6 +3615,7 @@ sortlines (struct line *restrict lines, size_t nthreads, merge_loop (queue, total_lines, tfp, temp_output); } + fprintf (stderr, "sortlines destroy_node %p\n", node); pthread_mutex_destroy (&node->lock); } @@ -3917,6 +3925,7 @@ sort (char *const *files, size_t nfiles, char const *output_file, sortlines (line, nthreads, buf.nlines, root, &queue, tfp, temp_output); queue_destroy (&queue); + fprintf (stderr, "sort destroy_node %p\n", root); pthread_mutex_destroy (&root->lock); merge_tree_destroy (merge_tree); }