Mostyn, There was a problem with the SM BTL. Please upgrade to at least r19315 and [hopefully] your application will run to completion.
Thanks, george. On Jul 24, 2008, at 3:39 AM, Mostyn Lewis wrote:
Hello, Using a very recent svn version (1.4a1r18899) I'm getting a non-terminating condition if I use the sm btl with tcp,self or with openib,self. The program is not finishing a reduce operation. It works if the sm btl is left out. Using two 4-core nodes. Program is: ----------------------------------------------------------------------------- /* * Copyright (c) 2001-2002 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 1998-2001 University of Notre Dame. * All rights reserved. * Copyright (c) 1994-1998 The Ohio State University. * All rights reserved. * * This file is part of the LAM/MPI software package. For license * information, see the LICENSE file in the top level directory of the * LAM/MPI source distribution. * * $HEADER$ * * $Id: cpi.c,v 1.4 2002/11/23 04:06:58 jsquyres Exp $ * * Portions taken from the MPICH distribution example cpi.c. * * Example program to calculate the value of pi by integrating f(x) = * 4 / (1 + x^2). */ #include <stdio.h> #include <sys/types.h> #include <unistd.h> #include <math.h> #include <mpi.h> /* Constant for how many values we'll estimate */ #define NUM_ITERS 1000 /* Prototype the function that we'll use below. 
*/ static double f(double); int main(int argc, char *argv[]) { int iter, rank, size, i; double PI25DT = 3.141592653589793238462643; double mypi, pi, h, sum, x; double startwtime = 0.0, endwtime; int namelen; char processor_name[MPI_MAX_PROCESSOR_NAME]; pid_t pid; pid = getpid(); char foo[200]; /* Normal MPI startup */ MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Get_processor_name(processor_name, &namelen); /* if(rank == 7){ */ printf("Process %d of %d on %s\n", rank, size, processor_name); /* system("set"); */ /* sprintf(foo,"%s %d","/tools/linux/bin/cpu -o -p ",pid); system(foo); */ /* } */ /* Do approximations for 1 to 100 points */ /* sleep(5); */ for (iter = 2; iter < NUM_ITERS; ++iter) { h = 1.0 / (double) iter; sum = 0.0; /* A slightly better approach starts from large i and works back */ if (rank == 0) startwtime = MPI_Wtime(); for (i = rank + 1; i <= iter; i += size) { x = h * ((double) i - 0.5); sum += f(x); } mypi = h * sum; MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); /* if (rank == 0) {MPI_Reduce(MPI_IN_PLACE, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);} else {MPI_Reduce(&mypi, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);} */ /* if (rank == 0) { printf("%d points: pi is approximately %.16f, error = %.16f\n", iter, pi, fabs(pi - PI25DT)); endwtime = MPI_Wtime(); printf("wall clock time = %f\n", endwtime - startwtime); fflush(stdout); } */ } /* All done */ MPI_Finalize(); return 0; } static double f(double a) { return (4.0 / (1.0 + a * a));} ----------------------------------------------------------------------------- A "good" non-sm btl run (--mca btl tcp,self --mca opal_event_include select) gives: Process 1 of 8 on r4150_18 Process 3 of 8 on r4150_18 Process 5 of 8 on r4150_17 Process 4 of 8 on r4150_17 Process 7 of 8 on r4150_17 Process 6 of 8 on r4150_17 Process 0 of 8 on r4150_18 Process 2 of 8 on r4150_18 A "bad" sm btl run (--mca btl sm,tcp,self --mca 
opal_event_include select) Attaching gdb to a non-terminating process shows: (gdb) where #0 0x000000366d4c78d3 in __select_nocancel () from /lib64/libc.so.6 #1 0x00002aaaab076546 in select_dispatch (base=0x278bc00, arg=0x278bb50, tv=0x7fff8bd179a0) at ../../.././opal/event/select.c:176 #2 0x00002aaaab073308 in opal_event_base_loop (base=0x278bc00, flags=2) at ../../.././opal/event/event.c:803 #3 0x00002aaaab073004 in opal_event_loop (flags=2) at ../../.././opal/event/event.c:726 #4 0x00002aaaab0636a4 in opal_progress () at ../.././opal/runtime/opal_progress.c:189 #5 0x00002aaaaaaf9c23 in opal_condition_wait (c=0x2aaaaadd7bc0, m=0x2aaaaadd7c20) at ../.././opal/threads/condition.h:100 #6 0x00002aaaaaafa1bd in ompi_request_default_wait_all (count=1, requests=0x7fff8bd17b50, statuses=0x0) at ../.././ompi/request/req_wait.c:262 #7 0x00002aaaaf0f913e in ompi_coll_tuned_reduce_generic (sendbuf=0x7fff8bd180d0, recvbuf=0x7fff8bd180c8, original_count=1, datatype=0x6012b0, op=0x6016a0, root=0, comm=0x601440, module=0x2832610, tree=0x28331c0, count_by_segment=1, max_outstanding_reqs=0) at ../../../../.././ompi/mca/coll/tuned/coll_tuned_reduce.c:168 #8 0x00002aaaaf0f9fcd in ompi_coll_tuned_reduce_intra_binomial (sendbuf=0x7fff8bd180d0, recvbuf=0x7fff8bd180c8, count=1, datatype=0x6012b0, op=0x6016a0, root=0, comm=0x601440, module=0x2832610, segsize=0, max_outstanding_reqs=0) at ../../../../.././ompi/mca/coll/tuned/coll_tuned_reduce.c:462 #9 0x00002aaaaf0ea075 in ompi_coll_tuned_reduce_intra_dec_fixed (sendbuf=0x7fff8bd180d0, recvbuf=0x7fff8bd180c8, count=1, datatype=0x6012b0, op=0x6016a0, root=0, comm=0x601440, module=0x2832610) at ../../../../.././ompi/mca/coll/tuned/coll_tuned_decision_fixed.c:389 #10 0x00002aaaaab41348 in PMPI_Reduce (sendbuf=0x7fff8bd180d0, recvbuf=0x7fff8bd180c8, count=1, datatype=0x6012b0, op=0x6016a0, root=0, comm=0x601440) at preduce.c:105 #11 0x0000000000400bc4 in main () opal/event/select.c:176 is: res = select(sop->event_fds + 1, sop->event_readset_out, 
sop->event_writeset_out, NULL, tv); opal/event/event.c:803 is: res = evsel->dispatch(base, evbase, tv_p); opal/event/event.c:726 is: return event_base_loop(current_base, flags); opal/runtime/opal_progress.c:189 is: events += opal_event_loop(opal_progress_event_flag); opal/threads/condition.h:100 is: while (c->c_signaled == 0) { opal_progress(); <----------------------------------100 OPAL_CR_TEST_CHECKPOINT_READY_STALL(); } ompi/request/req_wait.c:262 is: while (pending > ompi_request_completed - start) {opal_condition_wait(&ompi_request_cond, &ompi_request_lock); <-----262} I'll stop at this point. If the default poll is used there's a similar non-terminating loop. Any ideas at this point? Regards, Mostyn _______________________________________________ users mailing list us...@open-mpi.org http://www.open-mpi.org/mailman/listinfo.cgi/users
smime.p7s
Description: S/MIME cryptographic signature