On 03/13/2013 02:18 PM, McFarland, Jeffrey wrote:
> Here are the values from another sort that has been running for over 12 hours 
> now.  This time that second argument (number of threads) looks fine in all 
> three cases.  And this time there are no zombie threads.
> 
>> : pstack 20632
> 20632:  /usr/local/abacus/etsort/sort -tn -S 295063 --batch-size=100  -T 
> /disk/
> -----------------  lwp# 1 / thread# 1  --------------------
>  ffffffff7eadc810 lwp_wait (f2, ffffffff7fffea9c)
>  ffffffff7ead4d74 _thrp_join (f2, 0, 0, 1, ffffffff7fffeca0, 
> ffffffff7fffea9c) + 38
>  000000010000f2f4 sortlines (110137e90, 8, 7194a, 11015bfe0, 
> ffffffff7fffeca0, 100136240) + 174
>  0000000100010144 sort (100137cd0, 1, ffffffff7ffff660, 8, ffffffff7fffeeac, 
> ffffffff7ed00200) + 2f0
>  0000000100012bf4 main (13, ffffffff7ffff1f8, ffffffff7ffff298, 100136ca8, 
> 100000000, ffffffff7ed00200) + 21cc
>  0000000100004ca4 _start (0, 0, 0, 0, 0, 0) + 7c
> -----------------  lwp# 242 / thread# 242  --------------------
>  ffffffff7eadc810 lwp_wait (f4, ffffffff7e1fbd2c)
>  ffffffff7ead4d74 _thrp_join (f4, 0, 0, 1, ffffffff7fffeca0, 
> ffffffff7e1fbd2c) + 38
>  000000010000f2f4 sortlines (110137e90, 4, 7194a, 11015c050, 
> ffffffff7fffeca0, 100136240) + 174
>  000000010000f168 sortlines_thread (ffffffff7fffeb60, 1fc000, 0, 0, 
> 10000f104, 0) + 64
>  ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0)


> -----------------  lwp# 244 / thread# 244  --------------------
>  ffffffff7ead8818 lwp_park (0, 0, 0)
>  000000010000e710 lock_node (11015c360, 10f691fb0, ffffffff7ec4a300, 
> ffffffff7fffecac, ffffffff7ed00a00, 0) + 14
>  000000010000efbc queue_check_insert_parent (ffffffff7fffeca0, 11015c3d0, 
> 100136240, 1101597dd, ffffffff7ed00a00, 1c00) + 2c
>  000000010000f0e8 merge_loop (ffffffff7fffeca0, 7194a, 100136240, 1101597dd, 
> ffffffff7eacff0c, 3) + 90
>  000000010000f43c sortlines (110137e90, 2, 7194a, 11015c0c0, 
> ffffffff7fffeca0, 100136240) + 2bc
>  000000010000f168 sortlines_thread (ffffffff7e1fbdf0, 1fc000, 0, 0, 
> 10000f104, 0) + 64
>  ffffffff7ead8778 _lwp_start (0, 0, 0, 0, 0, 0)

This looks like a deadlock, but it may be triggered by stack corruption,
as the failure modes vary.
Would it be possible to annotate the lock_node() calls with the attached patch?
This should verify that we're at least not missing an unlock_node() call somewhere.
You can then capture the annotations by adding '2> locks' at the end of the 
command.

thanks,
Pádraig.
diff --git a/src/sort.c b/src/sort.c
index 7410abc..9d156ac 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -3196,6 +3196,7 @@ merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
   root->parent = NULL;
   root->level = MERGE_END;
   root->queued = false;
+  fprintf (stderr, "merge_tree_init init_node %p\n", root);
   pthread_mutex_init (&root->lock, NULL);
 
   init_node (root, root + 1, dest, nthreads, nlines, false);
@@ -3238,6 +3239,7 @@ init_node (struct merge_node *restrict parent,
   node->parent = parent;
   node->level = parent->level + 1;
   node->queued = false;
+  fprintf (stderr, "init_node init_node %p\n", node);
   pthread_mutex_init (&node->lock, NULL);
 
   if (nthreads > 1)
@@ -3335,6 +3337,7 @@ queue_pop (struct merge_node_queue *queue)
   while (! (node = heap_remove_top (queue->priority_queue)))
     pthread_cond_wait (&queue->cond, &queue->mutex);
   pthread_mutex_unlock (&queue->mutex);
+  fprintf (stderr, "queue_pop lock_node %p\n", node);
   lock_node (node);
   node->queued = false;
   return node;
@@ -3455,8 +3458,10 @@ queue_check_insert_parent (struct merge_node_queue *queue,
 {
   if (node->level > MERGE_ROOT)
     {
+      fprintf (stderr, "queue_check_insert_parent lock_node %p\n", node->parent);
       lock_node (node->parent);
       queue_check_insert (queue, node->parent);
+      fprintf (stderr, "queue_check_insert_parent unlock_node %p\n", node->parent);
       unlock_node (node->parent);
     }
   else if (node->nlo + node->nhi == 0)
@@ -3483,6 +3488,7 @@ merge_loop (struct merge_node_queue *queue,
 
       if (node->level == MERGE_END)
         {
+          fprintf (stderr, "merge_loop_MERGE_END unlock_node %p\n", node);
           unlock_node (node);
           /* Reinsert so other threads can pop it. */
           queue_insert (queue, node);
@@ -3492,6 +3498,7 @@ merge_loop (struct merge_node_queue *queue,
       queue_check_insert (queue, node);
       queue_check_insert_parent (queue, node);
 
+      fprintf (stderr, "merge_loop unlock_node %p\n", node);
       unlock_node (node);
     }
 }
@@ -3608,6 +3615,7 @@ sortlines (struct line *restrict lines, size_t nthreads,
       merge_loop (queue, total_lines, tfp, temp_output);
     }
 
+  fprintf (stderr, "sortlines destroy_node %p\n", node);
   pthread_mutex_destroy (&node->lock);
 }
 
@@ -3917,6 +3925,7 @@ sort (char *const *files, size_t nfiles, char const *output_file,
               sortlines (line, nthreads, buf.nlines, root,
                          &queue, tfp, temp_output);
               queue_destroy (&queue);
+              fprintf (stderr, "sort destroy_node %p\n", root);
               pthread_mutex_destroy (&root->lock);
               merge_tree_destroy (merge_tree);
             }

Reply via email to