On 03/13/2013 02:18 PM, McFarland, Jeffrey wrote:
> Here are the values from another sort that has been running for over 12 hours
> now. This time that second argument (number of threads) looks fine in all
> three cases. And this time there are no zombie threads.
>
>> : pstack 20632
> 20632: /usr/local/abacus/etsort/sort -tn -S 295063 --batch-size=100 -T
> /disk/
> ----------------- lwp# 1 / thread# 1 --------------------
> ffffffff7eadc810 lwp_wait (f2, ffffffff7fffea9c)
> ffffffff7ead4d74 _thrp_join (f2, 0, 0, 1, ffffffff7fffeca0,
> ffffffff7fffea9c) + 38
> 000000010000f2f4 sortlines (110137e90, 8, 7194a, 11015bfe0,
> ffffffff7fffeca0, 100136240) + 174
> 0000000100010144 sort (100137cd0, 1, ffffffff7ffff660, 8, ffffffff7fffeeac,
> ffffffff7ed00200) + 2f0
> 0000000100012bf4 main (13, ffffffff7ffff1f8, ffffffff7ffff298, 100136ca8,
> 100000000, ffffffff7ed00200) + 21cc
> 0000000100004ca4 _start (0, 0, 0, 0, 0, 0) + 7c
> ----------------- lwp# 242 / thread# 242 --------------------
> ffffffff7eadc810 lwp_wait (f4, ffffffff7e1fbd2c)
> ffffffff7ead4d74 _thrp_join (f4, 0, 0, 1, ffffffff7fffeca0,
> ffffffff7e1fbd2c) + 38
> 000000010000f2f4 sortlines (110137e90, 4, 7194a, 11015c050,
> ffffffff7fffeca0, 100136240) + 174
> 000000010000f168 sortlines_thread (ffffffff7fffeb60, 1fc000, 0, 0,
> 10000f104, 0) + 64
> ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0)
> ----------------- lwp# 244 / thread# 244 --------------------
> ffffffff7ead8818 lwp_park (0, 0, 0)
> 000000010000e710 lock_node (11015c360, 10f691fb0, ffffffff7ec4a300,
> ffffffff7fffecac, ffffffff7ed00a00, 0) + 14
> 000000010000efbc queue_check_insert_parent (ffffffff7fffeca0, 11015c3d0,
> 100136240, 1101597dd, ffffffff7ed00a00, 1c00) + 2c
> 000000010000f0e8 merge_loop (ffffffff7fffeca0, 7194a, 100136240, 1101597dd,
> ffffffff7eacff0c, 3) + 90
> 000000010000f43c sortlines (110137e90, 2, 7194a, 11015c0c0,
> ffffffff7fffeca0, 100136240) + 2bc
> 000000010000f168 sortlines_thread (ffffffff7e1fbdf0, 1fc000, 0, 0,
> 10000f104, 0) + 64
> ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0)
Looks like a deadlock, but may be triggered by stack corruption,
as the failure modes vary.
Would it be possible to annotate the lock_node() calls with the attached patch?
This should verify we're at least not missing an unlock() somewhere.
You can then capture the annotations (which are written to stderr) by
appending '2> locks' to the sort command, which saves them in a file named 'locks'.
thanks,
Pádraig.
diff --git a/src/sort.c b/src/sort.c
index 7410abc..9d156ac 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -3196,6 +3196,7 @@ merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
root->parent = NULL;
root->level = MERGE_END;
root->queued = false;
+ fprintf (stderr, "merge_tree_init init_node %p\n", root);
pthread_mutex_init (&root->lock, NULL);
init_node (root, root + 1, dest, nthreads, nlines, false);
@@ -3238,6 +3239,7 @@ init_node (struct merge_node *restrict parent,
node->parent = parent;
node->level = parent->level + 1;
node->queued = false;
+ fprintf (stderr, "init_node init_node %p\n", node);
pthread_mutex_init (&node->lock, NULL);
if (nthreads > 1)
@@ -3335,6 +3337,7 @@ queue_pop (struct merge_node_queue *queue)
while (! (node = heap_remove_top (queue->priority_queue)))
pthread_cond_wait (&queue->cond, &queue->mutex);
pthread_mutex_unlock (&queue->mutex);
+ fprintf (stderr, "queue_pop lock_node %p\n", node);
lock_node (node);
node->queued = false;
return node;
@@ -3455,8 +3458,10 @@ queue_check_insert_parent (struct merge_node_queue *queue,
{
if (node->level > MERGE_ROOT)
{
+ fprintf (stderr, "queue_check_insert_parent lock_node %p\n", node->parent);
lock_node (node->parent);
queue_check_insert (queue, node->parent);
+ fprintf (stderr, "queue_check_insert_parent unlock_node %p\n", node->parent);
unlock_node (node->parent);
}
else if (node->nlo + node->nhi == 0)
@@ -3483,6 +3488,7 @@ merge_loop (struct merge_node_queue *queue,
if (node->level == MERGE_END)
{
+ fprintf (stderr, "merge_loop_MERGE_END unlock_node %p\n", node);
unlock_node (node);
/* Reinsert so other threads can pop it. */
queue_insert (queue, node);
@@ -3492,6 +3498,7 @@ merge_loop (struct merge_node_queue *queue,
queue_check_insert (queue, node);
queue_check_insert_parent (queue, node);
+ fprintf (stderr, "merge_loop unlock_node %p\n", node);
unlock_node (node);
}
}
@@ -3608,6 +3615,7 @@ sortlines (struct line *restrict lines, size_t nthreads,
merge_loop (queue, total_lines, tfp, temp_output);
}
+ fprintf (stderr, "sortlines destroy_node %p\n", node);
pthread_mutex_destroy (&node->lock);
}
@@ -3917,6 +3925,7 @@ sort (char *const *files, size_t nfiles, char const *output_file,
sortlines (line, nthreads, buf.nlines, root,
&queue, tfp, temp_output);
queue_destroy (&queue);
+ fprintf (stderr, "sort destroy_node %p\n", root);
pthread_mutex_destroy (&root->lock);
merge_tree_destroy (merge_tree);
}